From 94bbb6cdaf17f4691e6e2750e5ad4cf39044f218 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Thu, 28 Mar 2024 22:11:53 -0700
Subject: [PATCH 001/323] Update test_xla_sharding.cpp

---
 test/cpp/test_xla_sharding.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)
diff --git a/test/cpp/test_xla_sharding.cpp b/test/cpp/test_xla_sharding.cpp
index e1f908b5c80..a17031f148e 100644
--- a/test/cpp/test_xla_sharding.cpp
+++ b/test/cpp/test_xla_sharding.cpp
@@ -435,5 +435,17 @@ TEST_F(XLAShardingTest, PrepareOutputShardingPropagation) {
           ->HasValue());
 }
 
+TEST_F(XLAShardingTest, TestForiLoopAddUnusedParameterInXlaComputation) {
+  xla::Shape shape = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {});
+  // Build simple addition.
+  xla::XlaBuilder b("builder");
+  auto x = xla::Parameter(&b, /*parameter_number=*/0, shape, "p0");
+  xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {});
+  auto zzz = xla::Parameter(&b, /*parameter_number=*/1, shape2, "p1");
+  auto y = xla::Add(x, xla::ConstantR0<float>(&b, 3));
+  xla::XlaComputation xla_computation =
+      ConsumeValue(b.Build(/*remove_dynamic_dimensions=*/false));
+}
+
 }  // namespace cpp_test
 }  // namespace torch_xla

From aacaa6279985aa4812f9bc696b097b8f0fb12574 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Thu, 28 Mar 2024 22:12:40 -0700
Subject: [PATCH 002/323] Update
 test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py

---
 ...while_loop_simple_add_dispatch_in_torch.py | 37 +++++++++++++------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index a76197cc736..e4c5218ccf5 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -18,6 +18,19 @@ def _fake_while_loop(cond_fn, body_fn, operands):
     operands = body_fn(*operands)
   return operands
 
+def _fake_fori_loop(lower, upper, body_fun, *init_val):
+  # operands need to be more than one here
+  # print("upper - lower: ", upper - lower)
+  # print("init_val: ", init_val)
+  # print("type init_val: ", type(init_val))
+  (a, b) = init_val
+  # print("a: ", a)
+  # print("b: ", b)
+  for i in range((upper - lower)[0]):
+    a = body_fun(a, b)
+    # print("a: ", a)
+    # print("i: ", i)
+  return a
 
 def _fake_fori_loop(lower, upper, body_fun, *init_val):
   (plus_value, init_val) = init_val
@@ -64,23 +77,23 @@ def body_fn(init, limit_value):
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
 
-  def test_while_loop_tpu_subtraction_nested(self):
+  def test_fori_loop_tpu_addition(self):
 
+    xm.mark_step()
     device = xm.xla_device()
 
-    def cond_fn(init, limit_value):
-      return limit_value[0] <= init[0]
+    lower = torch.tensor([2], dtype=torch.int32, device=device)
+    upper = torch.tensor([52], dtype=torch.int32, device=device)
+    one_value = torch.tensor([1], dtype=torch.int32, device=device)
+    init_val = torch.tensor([1], dtype=torch.int32, device=device)
 
-    def body_fn(init, limit_value):
-      one_value = torch.ones(1, dtype=torch.int32, device=device)
-      two_value = limit_value.clone()
-      return (torch.sub(torch.sub(init, one_value), one_value), two_value)
+    def body_fun(a, b):
+      return torch.add(a, b)
 
-    init = torch.tensor([10], dtype=torch.int32, device=device)
-    limit_value = torch.tensor([0], dtype=torch.int32, device=device)
-    res = while_loop(cond_fn, body_fn, (init, limit_value))
-    expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
-    self.assertEqual(expected, res)
+    lower_, upper_, res_ = fori_loop(upper, lower, body_fun, one_value, init_val)
+    expected = _fake_fori_loop(lower, upper, body_fun, init_val, one_value)
+    print("expected: ", expected)
+    self.assertEqual(expected, res_)
 
   def test_fori_loop_tpu_addition(self):
 

From a2f7062689c98d538c7971618411438b94f8a995 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Thu, 28 Mar 2024 22:14:46 -0700
Subject: [PATCH 003/323] Update init_python_bindings.cpp

---
 torch_xla/csrc/init_python_bindings.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index c603e5d27a5..8890296a61f 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -889,7 +889,17 @@ class PyLoweringContext {
       : lowering_ctx("PyLoweringContext", device) {}
 
   // Builds a HLO graph given a set of output tensors.
-  void Build(std::vector<at::Tensor> tensors) {
+  void Build(std::vector<at::Tensor> tensors, std::vector<at::Tensor> input_arguments) {
+    if (GetNameString() == "condctx") {
+      xla::XlaBuilder* local_builder = lowering_ctx.builder();
+      int64_t parameters_number_i = 2;
+      for (at::Tensor input_argument : input_arguments) {
+        xla::Shape shape = xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});
+        xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape, "UnusedArgumentsPlaceholder");
+        parameters_number_i = parameters_number_i + 1;
+      }
+    }
+
     // Get the backing XLA tensors from the output torch tensor handles
     std::vector<XLATensorPtr> xtensors =
         GetXlaTensors(tensors, /*want_all=*/true);

From bd4ff83036211feb9fd2f106d78de6b46efb8b05 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Thu, 28 Mar 2024 22:15:18 -0700
Subject: [PATCH 004/323] Update fori_loop.py

---
 torch_xla/experimental/fori_loop.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index bf32a712f3e..3533949fccd 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -11,6 +11,28 @@
 import torch._higher_order_ops.while_loop
 from torch._higher_order_ops.while_loop import while_loop_op
 
+def fori_loop(lower, upper, body_fun, one_value, init_val):
+
+  device = xm.xla_device()
+
+  def cond_fn(upper, lower, x):
+    return lower[0] < upper[0]
+
+  def body_fn(upper, lower, x):
+    one_value = torch.ones(1, dtype=torch.int32, device=device)
+    return (torch.sub(upper, one_value), lower, body_fun(one_value, x))
+
+  def old_cond_fn(one_value, lower, upper, init_val): 
+    lower_compare = torch.add(lower, one_value)
+    return lower_compare[0] <= upper[0]
+
+  def old_body_fn(one_value, lower, upper, init_val):
+    new_lower = torch.add(lower, one_value)
+    new_init_val = body_fun(init_val, one_value)
+    return (one_value, new_lower, upper, new_init_val)
+
+  res = _xla_while_loop(cond_fn, body_fn, lower, upper, init_val)
+  return res
 
 def fori_loop(lower, upper, user_body_func, *init_val):
 

From 5765405e9ea751b5f2d786140d20a3e8700b6c44 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Thu, 28 Mar 2024 22:26:59 -0700
Subject: [PATCH 005/323] format

---
 torch_xla/csrc/init_python_bindings.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 8890296a61f..c876e56bd8e 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -889,13 +889,16 @@ class PyLoweringContext {
       : lowering_ctx("PyLoweringContext", device) {}
 
   // Builds a HLO graph given a set of output tensors.
-  void Build(std::vector<at::Tensor> tensors, std::vector<at::Tensor> input_arguments) {
+  void Build(std::vector<at::Tensor> tensors,
+             std::vector<at::Tensor> input_arguments) {
     if (GetNameString() == "condctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
       int64_t parameters_number_i = 2;
       for (at::Tensor input_argument : input_arguments) {
-        xla::Shape shape = xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});
-        xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape, "UnusedArgumentsPlaceholder");
+        xla::Shape shape =
+            xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});
+        xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,
+                                      "UnusedArgumentsPlaceholder");
         parameters_number_i = parameters_number_i + 1;
       }
     }

From b518a1386df8fcc888599fb705b575fc311e9403 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Thu, 28 Mar 2024 22:29:51 -0700
Subject: [PATCH 006/323] format

---
 ...fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index e4c5218ccf5..727ec9dadb8 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -18,6 +18,7 @@ def _fake_while_loop(cond_fn, body_fn, operands):
     operands = body_fn(*operands)
   return operands
 
+
 def _fake_fori_loop(lower, upper, body_fun, *init_val):
   # operands need to be more than one here
   # print("upper - lower: ", upper - lower)
@@ -38,7 +39,6 @@ def _fake_fori_loop(lower, upper, body_fun, *init_val):
     plus_value, init_val = body_fun(plus_value, init_val)
   return init_val
 
-
 class WhileLoopTest(unittest.TestCase):
 
   def test_while_loop_tpu_subtraction(self):
@@ -90,7 +90,8 @@ def test_fori_loop_tpu_addition(self):
     def body_fun(a, b):
       return torch.add(a, b)
 
-    lower_, upper_, res_ = fori_loop(upper, lower, body_fun, one_value, init_val)
+    lower_, upper_, res_ = fori_loop(upper, lower, body_fun, one_value,
+                                     init_val)
     expected = _fake_fori_loop(lower, upper, body_fun, init_val, one_value)
     print("expected: ", expected)
     self.assertEqual(expected, res_)

From aacb407c4441b20dcdefa809329136a97686ce39 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Thu, 28 Mar 2024 22:32:22 -0700
Subject: [PATCH 007/323] format

---
 torch_xla/experimental/fori_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 3533949fccd..d34e4e8855d 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -11,6 +11,7 @@
 import torch._higher_order_ops.while_loop
 from torch._higher_order_ops.while_loop import while_loop_op
 
+
 def fori_loop(lower, upper, body_fun, one_value, init_val):
 
   device = xm.xla_device()
@@ -22,7 +23,7 @@ def body_fn(upper, lower, x):
     one_value = torch.ones(1, dtype=torch.int32, device=device)
     return (torch.sub(upper, one_value), lower, body_fun(one_value, x))
 
-  def old_cond_fn(one_value, lower, upper, init_val): 
+  def old_cond_fn(one_value, lower, upper, init_val):
     lower_compare = torch.add(lower, one_value)
     return lower_compare[0] <= upper[0]
 
@@ -51,7 +52,6 @@ def body_fn(upper, lower, *init_val):
   res = while_loop(cond_fn, body_fn, (lower, upper, *init_val))
   return res
 
-
 @while_loop_op.py_impl(DispatchKey.XLA)
 def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   # TODO(@manfei): PyTorch require carried_inputs to be list/tuple, PyTorch/XLA _xla_while_loop only accept *operands, *operands would tuple items again: (a, '')

From ff115afa9a783e3876f49e2cb1fa4ccc0ab592c2 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Fri, 29 Mar 2024 00:07:57 -0700
Subject: [PATCH 008/323] Update init_python_bindings.cpp

---
 torch_xla/csrc/init_python_bindings.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index c876e56bd8e..25ffae673f6 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -890,7 +890,7 @@ class PyLoweringContext {
 
   // Builds a HLO graph given a set of output tensors.
   void Build(std::vector<at::Tensor> tensors,
-             std::vector<at::Tensor> input_arguments) {
+             std::vector<at::Tensor> input_arguments = std::vector::empty) {
     if (GetNameString() == "condctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
       int64_t parameters_number_i = 2;

From b53e87faf11a7521a59fcea33cc28afa68cae64a Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Fri, 29 Mar 2024 01:27:13 -0700
Subject: [PATCH 009/323] Update init_python_bindings.cpp

---
 torch_xla/csrc/init_python_bindings.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 25ffae673f6..2b0221cb0a1 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -890,7 +890,7 @@ class PyLoweringContext {
 
   // Builds a HLO graph given a set of output tensors.
   void Build(std::vector<at::Tensor> tensors,
-             std::vector<at::Tensor> input_arguments = std::vector::empty) {
+             std::vector<at::Tensor> input_arguments = {}) {
     if (GetNameString() == "condctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
       int64_t parameters_number_i = 2;

From 27639c487d503e184cb85f72de25d7e579dc9f14 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 29 Mar 2024 17:47:17 +0000
Subject: [PATCH 010/323] test formal change

---
 ...p_with_while_loop_simple_add_dispatch_in_torch.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 727ec9dadb8..3578434bef3 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -20,17 +20,9 @@ def _fake_while_loop(cond_fn, body_fn, operands):
 
 
 def _fake_fori_loop(lower, upper, body_fun, *init_val):
-  # operands need to be more than one here
-  # print("upper - lower: ", upper - lower)
-  # print("init_val: ", init_val)
-  # print("type init_val: ", type(init_val))
   (a, b) = init_val
-  # print("a: ", a)
-  # print("b: ", b)
   for i in range((upper - lower)[0]):
     a = body_fun(a, b)
-    # print("a: ", a)
-    # print("i: ", i)
   return a
 
 def _fake_fori_loop(lower, upper, body_fun, *init_val):
@@ -55,7 +47,7 @@ def body_fn(init, limit_value):
 
     init = torch.tensor([10], dtype=torch.int32, device=device)
     limit_value = torch.tensor([0], dtype=torch.int32, device=device)
-    res = while_loop(cond_fn, body_fn, (init, limit_value))
+    res = while_loop(cond_fn, body_fn, init, limit_value) # (init, limit_value))
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
 
@@ -73,7 +65,7 @@ def body_fn(init, limit_value):
     # TODO(@manfei): init and limit_value has to be torch.tensor.
     init = torch.tensor([0], dtype=torch.int32, device=device)
     limit_value = torch.tensor([10], dtype=torch.int32, device=device)
-    res = while_loop(cond_fn, body_fn, (init, limit_value))
+    res = while_loop(cond_fn, body_fn, init, limit_value) # (init, limit_value))
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
 

From cf8b7bc8ae5f350d9bd81d8b391fc917ba1a7a2a Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 29 Mar 2024 17:48:16 +0000
Subject: [PATCH 011/323] test formal change

---
 torch_xla/experimental/fori_loop.py | 95 ++++++++++++++---------------
 1 file changed, 45 insertions(+), 50 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index d34e4e8855d..57f7162bf3f 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -12,45 +12,35 @@
 from torch._higher_order_ops.while_loop import while_loop_op
 
 
-def fori_loop(lower, upper, body_fun, one_value, init_val):
+# TODO(@manfei): delete one_value?
+def fori_loop(upper, lower, body_fun, init_val, *input_value):
 
   device = xm.xla_device()
+  # weight_0 = body_fun.weight
+  # bias_0 = body_fun.bias
+  # one_value = torch.tensor([1], dtype=torch.int32, device=device)
 
-  def cond_fn(upper, lower, x):
+  def cond_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0): # , bias_0): # , output_value):
     return lower[0] < upper[0]
 
-  def body_fn(upper, lower, x):
-    one_value = torch.ones(1, dtype=torch.int32, device=device)
-    return (torch.sub(upper, one_value), lower, body_fun(one_value, x))
-
-  def old_cond_fn(one_value, lower, upper, init_val):
-    lower_compare = torch.add(lower, one_value)
-    return lower_compare[0] <= upper[0]
-
-  def old_body_fn(one_value, lower, upper, init_val):
-    new_lower = torch.add(lower, one_value)
-    new_init_val = body_fun(init_val, one_value)
-    return (one_value, new_lower, upper, new_init_val)
-
-  res = _xla_while_loop(cond_fn, body_fn, lower, upper, init_val)
+  def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0): # , bias_0):
+    # weight = body_fun.weight
+    new_lower = torch.add(one_value, lower) ### !!! this matter, torch.add might would change the second argument's value, even we use a new variable to catch the result!!!
+    output_value = body_fun(*input_value) ### !!! due to the output_value is not actually used here, 
+    # --- !!! its original value would not be used, and it would be replaces by the result of body_fun
+    # --- !!! so, due to PTXLA is traced from result tensor, so the arguments `output_value` would not be included in the body_xlacomputation
+    # --- !!! so, we need to modify ini_python_binding.cpp to add a fake arguments in the xlacompputation
+    weight = body_fun.weight
+    bias = body_fun.bias
+    return upper, new_lower, one_value, torch.add(one_value, x), *input_value, weight, bias, output_value
+
+  output_value = torch.zeros([20], dtype=torch.float32, device=device)
+  weight_0 = body_fun.weight
+  bias_0 = body_fun.bias
+  one_value = torch.tensor([1], dtype=torch.int32, device=device)
+  res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, *input_value, weight_0, bias_0, output_value))
   return res
 
-def fori_loop(lower, upper, user_body_func, *init_val):
-
-  device = xm.xla_device()
-
-  def cond_fn(upper, lower, *init_val):
-    return lower[0] < upper[0]
-
-  def body_fn(upper, lower, *init_val):
-    one_value_i = torch.ones(1, dtype=torch.int32, device=device)
-    res_list = list(user_body_func(*init_val))
-    res_list.insert(0, lower)
-    res_list.insert(0, torch.sub(upper, one_value_i))
-    return res_list
-
-  res = while_loop(cond_fn, body_fn, (lower, upper, *init_val))
-  return res
 
 @while_loop_op.py_impl(DispatchKey.XLA)
 def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
@@ -59,8 +49,7 @@ def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   # carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)
   if additional_inputs is None:
     additional_inputs = tuple()
-  return _xla_while_loop(
-      cond_fn, body_fn, *carried_inputs, additional_inputs=additional_inputs)
+  return _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=additional_inputs) #  a=a, b=b, c=c,
 
 
 def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
@@ -70,34 +59,27 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
   fake_carried_inputs = []
   for carried_input in carried_inputs:
     device = carried_input.device
+    #TODO(@manfei) type = carried_input.type
     fake_carried_inputs.append(
         torch.randint(10, carried_input.size(),
                       dtype=carried_input.dtype).to(device))
   fake_carried_inputs = tuple(fake_carried_inputs)
 
-  # trans fake_carried_inputs from list(tensor) to list(xla::op)
-  kwargs = {}
-  if type(fake_carried_inputs) is tuple:
-    shapes = xb.tensor_shape(fake_carried_inputs)
-  else:
-    shapes = xb.tensor_shape((fake_carried_inputs))
-  builder = xb.create_builder('test_while')
-  params = []
-  for shape in shapes:
-    p = xb.mkparam(builder, len(params), shape)
-    params.append(p)
-
   # generate cond_fn xlacomputation
-  cond_result = cond_fn(*fake_carried_inputs)
+  # TODO(@manfei): specify which element is for which argument like a,b,c
+  cond_result = cond_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], output_value=fake_carried_inputs[-2], bias_0=fake_carried_inputs[-1])
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
-  cond_ctx.buildforiloop([cond_result], list(fake_carried_inputs[2:]))
+  additional_inputs_list = list(fake_carried_inputs[2:])
+  for i in range(len(additional_inputs)):
+    additional_inputs_list.append(additional_inputs[0])
+  cond_ctx.buildforiloop([cond_result], additional_inputs_list)
   cond_hlo = cond_ctx.hlo()
   cond_computation = xb.computation_from_module_proto("condcomputation",
                                                       cond_hlo)
 
   # generate body_fn xlacomputation
-  body_result = body_fn(*fake_carried_inputs)
+  body_result = body_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], output_value=fake_carried_inputs[-2], bias_0=fake_carried_inputs[-1])
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
   body_ctx.set_name_string("bodyctx")
   body_ctx.buildforiloop(list(body_result), [])
@@ -105,6 +87,18 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
   body_computation = xb.computation_from_module_proto("bodycomputation",
                                                       body_hlo)
 
+  # trans fake_carried_inputs from list(tensor) to list(xla::op), which part could change init of xla::while
+  kwargs = {}
+  if type(carried_inputs) is tuple:
+    shapes = xb.tensor_shape(carried_inputs)
+  else:
+    shapes = xb.tensor_shape((carried_inputs))
+  builder = xb.create_builder('test_while')
+  params = []
+  for shape in shapes:
+    p = xb.mkparam(builder, len(params), shape)
+    params.append(p)
+
   # generate while xlacomputation
   input_tuple = xb.Op.tuple(tuple(params))
   w = xb.mkop(
@@ -116,6 +110,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
 
   # gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',
-                                                 (carried_inputs), computation)
+                                                 (carried_inputs),
+                                                 computation)
 
   return result
\ No newline at end of file

From 945ab7ae07e271342c74fdb8173fcb54c35c4e51 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 29 Mar 2024 17:52:16 +0000
Subject: [PATCH 012/323] test formal change

---
 ...ri_loop_with_while_loop_simple_add_dispatch_in_torch.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 3578434bef3..928e70ffc7e 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -55,10 +55,12 @@ def test_while_loop_tpu_addition(self):
 
     device = xm.xla_device()
 
-    def cond_fn(init, limit_value):
+    def cond_fn(loop_carry): # init, limit_value):
+      init, limit_value = loop_carry
       return limit_value[0] >= init[0]
 
-    def body_fn(init, limit_value):
+    def body_fn(loop_carry): # init, limit_value):
+      init, limit_value = loop_carry
       one_value = torch.ones(1, dtype=torch.int32, device=device)
       return (torch.add(init, one_value), limit_value.clone())
 
@@ -85,7 +87,6 @@ def body_fun(a, b):
     lower_, upper_, res_ = fori_loop(upper, lower, body_fun, one_value,
                                      init_val)
     expected = _fake_fori_loop(lower, upper, body_fun, init_val, one_value)
-    print("expected: ", expected)
     self.assertEqual(expected, res_)
 
   def test_fori_loop_tpu_addition(self):

From f561a127a9cff04cacf96b4cfa176ebdb5c5fb29 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 29 Mar 2024 17:52:59 +0000
Subject: [PATCH 013/323] test formal change

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 928e70ffc7e..b418d0f0ba6 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -47,7 +47,7 @@ def body_fn(init, limit_value):
 
     init = torch.tensor([10], dtype=torch.int32, device=device)
     limit_value = torch.tensor([0], dtype=torch.int32, device=device)
-    res = while_loop(cond_fn, body_fn, init, limit_value) # (init, limit_value))
+    res = while_loop(cond_fn, body_fn, (init, limit_value))
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
 
@@ -67,7 +67,7 @@ def body_fn(loop_carry): # init, limit_value):
     # TODO(@manfei): init and limit_value has to be torch.tensor.
     init = torch.tensor([0], dtype=torch.int32, device=device)
     limit_value = torch.tensor([10], dtype=torch.int32, device=device)
-    res = while_loop(cond_fn, body_fn, init, limit_value) # (init, limit_value))
+    res = while_loop(cond_fn, body_fn, (init, limit_value))
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
 

From 834cb251c4972153c9ddb4b644246f48f1ba54b8 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 29 Mar 2024 18:24:04 +0000
Subject: [PATCH 014/323] test formal change

---
 ...ori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index b418d0f0ba6..55a02a55e48 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -55,12 +55,10 @@ def test_while_loop_tpu_addition(self):
 
     device = xm.xla_device()
 
-    def cond_fn(loop_carry): # init, limit_value):
-      init, limit_value = loop_carry
+    def cond_fn(init, limit_value):
       return limit_value[0] >= init[0]
 
-    def body_fn(loop_carry): # init, limit_value):
-      init, limit_value = loop_carry
+    def body_fn(init, limit_value):
       one_value = torch.ones(1, dtype=torch.int32, device=device)
       return (torch.add(init, one_value), limit_value.clone())
 

From a9814d2c04bde1729c3a6b015f5462b63024accd Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Mon, 1 Apr 2024 10:57:14 -0700
Subject: [PATCH 015/323] Update test_xla_sharding.cpp

---
 test/cpp/test_xla_sharding.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/cpp/test_xla_sharding.cpp b/test/cpp/test_xla_sharding.cpp
index a17031f148e..b59927cdbf7 100644
--- a/test/cpp/test_xla_sharding.cpp
+++ b/test/cpp/test_xla_sharding.cpp
@@ -440,11 +440,15 @@ TEST_F(XLAShardingTest, TestForiLoopAddUnusedParameterInXlaComputation) {
   // Build simple addition.
   xla::XlaBuilder b("builder");
   auto x = xla::Parameter(&b, /*parameter_number=*/0, shape, "p0");
+  // Add unused parameter before create xlacomputation
   xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {});
   auto zzz = xla::Parameter(&b, /*parameter_number=*/1, shape2, "p1");
   auto y = xla::Add(x, xla::ConstantR0<float>(&b, 3));
   xla::XlaComputation xla_computation =
       ConsumeValue(b.Build(/*remove_dynamic_dimensions=*/false));
+
+  // Check whether the unused parameter has been included into xlacomputation or not 
+  EXPECT_EQ(xla_computation.GetProgramShape().parameters_size(), 2);
 }
 
 }  // namespace cpp_test

From feb39cace9acd0e1cad5384f7cf565aee921e6db Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Mon, 1 Apr 2024 11:49:42 -0700
Subject: [PATCH 016/323] Update test_xla_sharding.cpp

---
 test/cpp/test_xla_sharding.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/cpp/test_xla_sharding.cpp b/test/cpp/test_xla_sharding.cpp
index b59927cdbf7..55cdc8f1fb4 100644
--- a/test/cpp/test_xla_sharding.cpp
+++ b/test/cpp/test_xla_sharding.cpp
@@ -448,7 +448,7 @@ TEST_F(XLAShardingTest, TestForiLoopAddUnusedParameterInXlaComputation) {
       ConsumeValue(b.Build(/*remove_dynamic_dimensions=*/false));
 
   // Check whether the unused parameter has been included into xlacomputation or not 
-  EXPECT_EQ(xla_computation.GetProgramShape().parameters_size(), 2);
+  EEXPECT_EQ(xla_computation.GetProgramShape()->parameters_size(), 2);
 }
 
 }  // namespace cpp_test

From 8b2cd86eadde4e4ca7a37b37d93cea770ac35d56 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Mon, 1 Apr 2024 11:52:57 -0700
Subject: [PATCH 017/323] format

---
 test/cpp/test_xla_sharding.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/cpp/test_xla_sharding.cpp b/test/cpp/test_xla_sharding.cpp
index 55cdc8f1fb4..e27be283f66 100644
--- a/test/cpp/test_xla_sharding.cpp
+++ b/test/cpp/test_xla_sharding.cpp
@@ -447,7 +447,7 @@ TEST_F(XLAShardingTest, TestForiLoopAddUnusedParameterInXlaComputation) {
   xla::XlaComputation xla_computation =
       ConsumeValue(b.Build(/*remove_dynamic_dimensions=*/false));
 
-  // Check whether the unused parameter has been included into xlacomputation or not 
+  // Check whether the unused parameter has been included into xlacomputation
   EEXPECT_EQ(xla_computation.GetProgramShape()->parameters_size(), 2);
 }
 

From cc1e7ef3a5d19f4c679c63549e762d87b8c9abdf Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Mon, 1 Apr 2024 11:56:01 -0700
Subject: [PATCH 018/323] format

---
 test/cpp/test_xla_sharding.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/cpp/test_xla_sharding.cpp b/test/cpp/test_xla_sharding.cpp
index e27be283f66..167ffd753e7 100644
--- a/test/cpp/test_xla_sharding.cpp
+++ b/test/cpp/test_xla_sharding.cpp
@@ -448,7 +448,7 @@ TEST_F(XLAShardingTest, TestForiLoopAddUnusedParameterInXlaComputation) {
       ConsumeValue(b.Build(/*remove_dynamic_dimensions=*/false));
 
   // Check whether the unused parameter has been included into xlacomputation
-  EEXPECT_EQ(xla_computation.GetProgramShape()->parameters_size(), 2);
+  EXPECT_EQ(xla_computation.GetProgramShape()->parameters_size(), 2);
 }
 
 }  // namespace cpp_test

From 1c54e4b03552c21b4a9dd66002a36253e9050e2d Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 2 Apr 2024 18:07:15 +0000
Subject: [PATCH 019/323] upstream

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 55a02a55e48..b92d8e0ee44 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -47,7 +47,7 @@ def body_fn(init, limit_value):
 
     init = torch.tensor([10], dtype=torch.int32, device=device)
     limit_value = torch.tensor([0], dtype=torch.int32, device=device)
-    res = while_loop(cond_fn, body_fn, (init, limit_value))
+    res = while_loop(cond_fn, body_fn, init, limit_value)
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
 

From cb3a5f495d0ba41cdc4081b558945a0b87c6db54 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 2 Apr 2024 18:09:11 +0000
Subject: [PATCH 020/323] upstream

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index b92d8e0ee44..55a02a55e48 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -47,7 +47,7 @@ def body_fn(init, limit_value):
 
     init = torch.tensor([10], dtype=torch.int32, device=device)
     limit_value = torch.tensor([0], dtype=torch.int32, device=device)
-    res = while_loop(cond_fn, body_fn, init, limit_value)
+    res = while_loop(cond_fn, body_fn, (init, limit_value))
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
 

From 9007442dba389063ae79e0a0ea51cbdbaca25697 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 2 Apr 2024 21:35:42 +0000
Subject: [PATCH 021/323] upstream

---
 torch_xla/experimental/fori_loop.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 57f7162bf3f..da18f2e8683 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -52,7 +52,9 @@ def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   return _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=additional_inputs) #  a=a, b=b, c=c,
 
 
-def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
+def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
+  print("carried_inputs: ", carried_inputs)
+  print("additional_inputs: ", additional_inputs)
   # untuple carried_inputs from while_loop
   carried_inputs = carried_inputs[0]
   # fake carried_inputs to split formal code

From 5394b464510398cdd120a1e31c9ef1b747a68346 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 2 Apr 2024 23:12:47 +0000
Subject: [PATCH 022/323] test

---
 ...fori_loop_simple_linear_model_test_code.py | 193 ++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100644 test/test_fori_loop_simple_linear_model_test_code.py

diff --git a/test/test_fori_loop_simple_linear_model_test_code.py b/test/test_fori_loop_simple_linear_model_test_code.py
new file mode 100644
index 00000000000..006b602f001
--- /dev/null
+++ b/test/test_fori_loop_simple_linear_model_test_code.py
@@ -0,0 +1,193 @@
+import os
+# import unittest
+# from typing import Callable, Dict, List
+
+import torch
+import torch_xla
+# We need to import the underlying implementation function to register with the dispatcher
+import torch_xla.experimental.fori_loop
+from torch_xla.experimental.fori_loop import fori_loop
+# from torch._higher_order_ops.while_loop import while_loop
+import torch_xla.core.xla_model as xm
+# import torch_xla.core.xla_builder as xb
+import torch_xla.utils.utils as xu
+
+torch.set_grad_enabled(False)
+
+device = xm.xla_device()
+
+# --- linear one ---
+# l_in = torch.randn(10, device=xm.xla_device())
+# linear = torch.nn.Linear(10, 20).to(xm.xla_device())
+# l_out = linear(l_in)
+# print("linear one: ", l_out)
+
+# --- while test case ---
+
+lower = torch.tensor([2], dtype=torch.int32, device=device)
+upper = torch.tensor([52], dtype=torch.int32, device=device)
+one_value = torch.tensor([1], dtype=torch.int32, device=device)
+init_val = torch.tensor([1], dtype=torch.int32, device=device)
+# one_one = torch.one(1, dtype=torch.int32, device=device)
+
+# def body_fun(l_in):
+#   # l_in = torch.randn(10, device=xm.xla_device())
+#   linear = torch.nn.Linear(10, 20).to(xm.xla_device())
+#   # l_out = linear(l_in)
+#   return linear(l_in) # torch.add(a, b) # [0])
+linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
+
+def body_fun(y, x, l_in_i):
+  # l_in = torch.randn(10, device=xm.xla_device())
+  # linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
+  l_out = linear_0(l_in_i)
+  # placeholder_func = torch.rand(size = l_out.size(), device = device)
+  # placeholder_input = torch.rand(size = l_in_i.size(), device = device)
+  # return torch.add(y, x), l_out, placeholder_func, placeholder_input # linear_0(l_in_i), linear_0, l_in_i # additional return: body and input-placeholder   # linear(l_in) # torch.add(a, b) # [0])
+  return torch.add(y, x), l_out
+
+# TODO(@manfei), need to create new variable to seperate old/formal HLO/IR
+l_in_0 = torch.randn(10, device=xm.xla_device())
+
+# def body_fun(x, y, l_in):
+#   # l_in = torch.randn(10, device=xm.xla_device())
+#   linear = torch.nn.Linear(10, 20).to(xm.xla_device())
+#   # l_out = linear(l_in)
+#   return torch.add(x, y), linear(l_in) # linear(l_in) # torch.add(a, b) # [0])
+
+# placeholder_func = torch.rand(size = l_out.size(), device = device)
+# placeholder_input = torch.rand(size = l_in_i.size(), device = device)
+print("test code, body_fun: ", body_fun)
+
+lower_, upper_, res_ = fori_loop(upper, lower, body_fun, one_value, init_val, l_in_0) # , placeholder_func, placeholder_input) # , linear_0, l_in_0)
+
+print("lower_: ", lower_)
+print("upper_: ", upper_)
+print("res_: ", res_)
+
+# --- linear two ---
+# l_in_2 = torch.randn(10, device=xm.xla_device())
+# linear_2 = torch.nn.Linear(10, 20).to(xm.xla_device())
+# l_out_2 = linear(l_in_2)
+# print("linear two: ", l_out_2)
+
+# =================================================================================
+
+# import numpy as np
+# # create dummy data for training
+# # x_values = [i for i in range(11)]
+# # x_train = np.array(x_values, dtype=np.float32)
+# # x_train = x_train.reshape(-1, 1)
+
+# # y_values = [2*i + 1 for i in x_values]
+# # y_train = np.array(y_values, dtype=np.float32)
+# # y_train = y_train.reshape(-1, 1)
+
+# batch_size = 2
+
+# train_loader = xu.SampleGenerator(
+#     data=(torch.zeros(batch_size, 1), torch.zeros(batch_size, dtype=torch.float32)),
+#     sample_count=64 // batch_size // xm.xrt_world_size())
+# test_loader = xu.SampleGenerator(
+#     data=(torch.zeros(batch_size, 1, torch.zeros(batch_size, dtype=torch.float32)),
+#     sample_count=32 // batch_size // xm.xrt_world_size())
+
+# # import torch
+# from torch.autograd import Variable
+
+# class linearRegression(torch.nn.Module):
+#     def __init__(self, inputSize, outputSize):
+#         super(linearRegression, self).__init__()
+#         self.linear = torch.nn.Linear(inputSize, outputSize).to(device)
+
+#     def forward(self, x):
+#         out = self.linear(x)
+#         return out
+
+# # --- training ---
+# inputDim = 1        # takes variable 'x' 
+# outputDim = 1       # takes variable 'y'
+# learningRate = 0.01 * xm.xrt_world_size()
+# epochs = 10 # 100
+
+# model = linearRegression(inputDim, outputDim).to(device)
+# # model = MNIST().to(device)
+# ##### For GPU #######
+# # if torch.cuda.is_available():
+# #     model.cuda()
+
+# if xr.using_pjrt():
+#   xm.broadcast_master_param(model)
+
+# criterion = torch.nn.MSELoss() 
+# optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)
+
+# for epoch in range(epochs):
+#     # Converting inputs and labels to Variable
+#     # if torch.cuda.is_available():
+#     #     inputs = Variable(torch.from_numpy(x_train).cuda())
+#     #     labels = Variable(torch.from_numpy(y_train).cuda())
+#     # else:
+#     inputs = Variable(torch.from_numpy(x_train)).to(device)
+#     labels = Variable(torch.from_numpy(y_train)).to(device)
+
+#     # Clear gradient buffers because we don't want any gradient from previous epoch to carry forward, dont want to cummulate gradients
+#     optimizer.zero_grad()
+
+#     # get output from the model, given the inputs
+#     outputs = model(inputs)
+
+#     # get loss for the predicted output
+#     loss = criterion(outputs, labels)
+#     print(loss)
+#     # get gradients w.r.t to parameters
+#     loss.backward()
+
+#     # update parameters
+#     # optimizer.step()
+#     xm.optimizer_step(optimizer)
+
+#     print('epoch {}, loss {}'.format(epoch, loss.item()))
+
+# # --- while simple test case ---
+
+# # device = xm.xla_device()
+
+# lower = torch.tensor([2], dtype=torch.int32, device=device)
+# upper = torch.tensor([52], dtype=torch.int32, device=device)
+# one_value = torch.tensor([1], dtype=torch.int32, device=device)
+# init_val = torch.tensor([1], dtype=torch.int32, device=device)
+
+# def body_fun(a, b):
+#   return torch.add(a, b) # [0])
+
+# lower_, upper_, res_ = fori_loop(upper, lower, body_fun, one_value, init_val)
+
+# print("lower_: ", lower_)
+# print("upper_: ", upper_)
+# print("res_: ", res_)
+
+# # --- test ---
+# for epoch in range(epochs):
+#     with torch.no_grad(): # we don't need gradients in the testing phase
+#         if torch.cuda.is_available():
+#             predicted = model(Variable(torch.from_numpy(x_train).cuda())).cpu().data.numpy()
+#         else:
+#             predicted = model(Variable(torch.from_numpy(x_train))).data.numpy()
+#         print(epoch, "-th prediction finised") # ed result: ", predicted)
+
+# print("do one more prediction")
+# with torch.no_grad(): # we don't need gradients in the testing phase
+#     if torch.cuda.is_available():
+#         predicted = model(Variable(torch.from_numpy(x_train).cuda())).cpu().data.numpy()
+#     else:
+#         predicted = model(Variable(torch.from_numpy(x_train))).data.numpy()
+#     print(predicted)
+# print("finished one more prediction")
+
+# # --- draw ---
+# # plt.clf()
+# # plt.plot(x_train, y_train, 'go', label='True data', alpha=0.5)
+# # plt.plot(x_train, predicted, '--', label='Predictions', alpha=0.5)
+# # plt.legend(loc='best')
+# # plt.show()
\ No newline at end of file

From 972bc82f870e26c086670902c8acabf8f08e6938 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 2 Apr 2024 23:27:00 +0000
Subject: [PATCH 023/323] test

---
 torch_xla/experimental/fori_loop.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index da18f2e8683..cb8866e78ad 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -79,6 +79,9 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   cond_hlo = cond_ctx.hlo()
   cond_computation = xb.computation_from_module_proto("condcomputation",
                                                       cond_hlo)
+  cond_hlo_print = xb.get_computation_hlo(cond_computation)
+  print("cond computation: !!!!!!!!!")
+  print(cond_hlo_print)
 
   # generate body_fn xlacomputation
   body_result = body_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], output_value=fake_carried_inputs[-2], bias_0=fake_carried_inputs[-1])
@@ -88,6 +91,9 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",
                                                       body_hlo)
+  body_hlo_print = xb.get_computation_hlo(body_computation)
+  print("body computation: !!!!!!!!!")
+  print(body_hlo_print)
 
   # trans fake_carried_inputs from list(tensor) to list(xla::op), which part could change init of xla::while
   kwargs = {}
@@ -109,6 +115,9 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
       body_computation=body_computation)
   name = 'fori_loop_ed_torch_func'
   computation = w.build(name)
+  hlo_print = xb.get_computation_hlo(computation)
+  print("while computation: !!!!!!!!!")
+  print(hlo_print)
 
   # gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',

From 518bdbc124273a6fc547246faab259a7d981b249 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 3 Apr 2024 05:59:59 +0000
Subject: [PATCH 024/323] test

---
 torch_xla/csrc/init_python_bindings.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 2b0221cb0a1..a7bed9e0bf9 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -901,6 +901,20 @@ class PyLoweringContext {
                                       "UnusedArgumentsPlaceholder");
         parameters_number_i = parameters_number_i + 1;
       }
+      // hard-code to meet requirement
+      // f32[20], /*index=5*/f32[20,10], s32[10]
+      parameters_number_i = parameters_number_i + 1;
+      xla::Shape shape = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
+      xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,
+                                      "OutPutTensor");
+      parameters_number_i = parameters_number_i + 1;
+      xla::Shape shape = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
+      xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,
+                                      "WeightTensor");
+      parameters_number_i = parameters_number_i + 1;
+      xla::Shape shape = xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {10});
+      xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,
+                                      "FinalOneTensor");
     }
 
     // Get the backing XLA tensors from the output torch tensor handles

From 16977fdc46ffca76b3157f810b14ad4403a2c93a Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 3 Apr 2024 06:03:11 +0000
Subject: [PATCH 025/323] test

---
 torch_xla/csrc/init_python_bindings.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index a7bed9e0bf9..36c0e93fd23 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -904,16 +904,16 @@ class PyLoweringContext {
       // hard-code to meet requirement
       // f32[20], /*index=5*/f32[20,10], s32[10]
       parameters_number_i = parameters_number_i + 1;
-      xla::Shape shape = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-      xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,
+      xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
+      xla::XlaOp x1 = xla::Parameter(local_builder, parameters_number_i, shape1,
                                       "OutPutTensor");
       parameters_number_i = parameters_number_i + 1;
-      xla::Shape shape = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
-      xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,
+      xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
+      xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
                                       "WeightTensor");
       parameters_number_i = parameters_number_i + 1;
-      xla::Shape shape = xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {10});
-      xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,
+      xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {10});
+      xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
                                       "FinalOneTensor");
     }
 

From ab00e2ba0ecf23f6c113fbde2438147ccbd7b320 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 3 Apr 2024 06:35:29 +0000
Subject: [PATCH 026/323] test

---
 torch_xla/experimental/fori_loop.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index cb8866e78ad..715a94cd4be 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -66,6 +66,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
         torch.randint(10, carried_input.size(),
                       dtype=carried_input.dtype).to(device))
   fake_carried_inputs = tuple(fake_carried_inputs)
+  print("fake_carried_inputs: ", fake_carried_inputs)
 
   # generate cond_fn xlacomputation
   # TODO(@manfei): specify which element is for which argument like a,b,c

From 33f9e070f10160a775d210ce284fe05dde7c3a8d Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 3 Apr 2024 06:48:24 +0000
Subject: [PATCH 027/323] test

---
 torch_xla/csrc/init_python_bindings.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 36c0e93fd23..3367968de1a 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -894,7 +894,8 @@ class PyLoweringContext {
     if (GetNameString() == "condctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
       int64_t parameters_number_i = 2;
-      for (at::Tensor input_argument : input_arguments) {
+      // for (at::Tensor input_argument : input_arguments) {
+      for (int i = 0; i < 2; i++) {
         xla::Shape shape =
             xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});
         xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,

From 5fd313ae8ba70190777cd0a5b663f5a77e1ac632 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 3 Apr 2024 06:51:21 +0000
Subject: [PATCH 028/323] test

---
 torch_xla/csrc/init_python_bindings.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 3367968de1a..0a6ea4af3d7 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -895,13 +895,13 @@ class PyLoweringContext {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
       int64_t parameters_number_i = 2;
       // for (at::Tensor input_argument : input_arguments) {
-      for (int i = 0; i < 2; i++) {
-        xla::Shape shape =
-            xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});
-        xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,
-                                      "UnusedArgumentsPlaceholder");
-        parameters_number_i = parameters_number_i + 1;
-      }
+      // for (int i = 0; i < 2; i++) {
+      //   xla::Shape shape =
+      //       xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});
+      //   xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,
+      //                                 "UnusedArgumentsPlaceholder");
+      //   parameters_number_i = parameters_number_i + 1;
+      // }
       // hard-code to meet requirement
       // f32[20], /*index=5*/f32[20,10], s32[10]
       parameters_number_i = parameters_number_i + 1;

From a6ba3b892065178d6f7181df5455c6a465671179 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 3 Apr 2024 06:53:47 +0000
Subject: [PATCH 029/323] test

---
 torch_xla/csrc/init_python_bindings.cpp | 38 ++++++++++++-------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 0a6ea4af3d7..dbbcf1ecf57 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -894,28 +894,28 @@ class PyLoweringContext {
     if (GetNameString() == "condctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
       int64_t parameters_number_i = 2;
-      // for (at::Tensor input_argument : input_arguments) {
+      for (at::Tensor input_argument : input_arguments) {
       // for (int i = 0; i < 2; i++) {
-      //   xla::Shape shape =
-      //       xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});
-      //   xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,
-      //                                 "UnusedArgumentsPlaceholder");
-      //   parameters_number_i = parameters_number_i + 1;
-      // }
+        xla::Shape shape =
+            xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});
+        xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,
+                                      "UnusedArgumentsPlaceholder");
+        parameters_number_i = parameters_number_i + 1;
+      }
       // hard-code to meet requirement
       // f32[20], /*index=5*/f32[20,10], s32[10]
-      parameters_number_i = parameters_number_i + 1;
-      xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-      xla::XlaOp x1 = xla::Parameter(local_builder, parameters_number_i, shape1,
-                                      "OutPutTensor");
-      parameters_number_i = parameters_number_i + 1;
-      xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
-      xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
-                                      "WeightTensor");
-      parameters_number_i = parameters_number_i + 1;
-      xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {10});
-      xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
-                                      "FinalOneTensor");
+      // parameters_number_i = parameters_number_i + 1;
+      // xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
+      // xla::XlaOp x1 = xla::Parameter(local_builder, parameters_number_i, shape1,
+      //                                 "OutPutTensor");
+      // parameters_number_i = parameters_number_i + 1;
+      // xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
+      // xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
+      //                                 "WeightTensor");
+      // parameters_number_i = parameters_number_i + 1;
+      // xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {10});
+      // xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
+      //                                 "FinalOneTensor");
     }
 
     // Get the backing XLA tensors from the output torch tensor handles

From a09457edeb987a5c96a47bd4329f97b28546700b Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 3 Apr 2024 07:08:46 +0000
Subject: [PATCH 030/323] test

---
 torch_xla/csrc/init_python_bindings.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index dbbcf1ecf57..140c6755b49 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -894,8 +894,8 @@ class PyLoweringContext {
     if (GetNameString() == "condctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
       int64_t parameters_number_i = 2;
-      for (at::Tensor input_argument : input_arguments) {
-      // for (int i = 0; i < 2; i++) {
+      // for (at::Tensor input_argument : input_arguments) {
+      for (int i = 0; i < 5; i++) {
         xla::Shape shape =
             xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});
         xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,

From 8ebc7721b8462a7df93dc8ab523c3b2f1b591c34 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 3 Apr 2024 07:12:50 +0000
Subject: [PATCH 031/323] test

---
 torch_xla/csrc/init_python_bindings.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 140c6755b49..04517683ac2 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -895,7 +895,7 @@ class PyLoweringContext {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
       int64_t parameters_number_i = 2;
       // for (at::Tensor input_argument : input_arguments) {
-      for (int i = 0; i < 5; i++) {
+      for (int i = 0; i < 4; i++) {
         xla::Shape shape =
             xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});
         xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,
@@ -904,10 +904,10 @@ class PyLoweringContext {
       }
       // hard-code to meet requirement
       // f32[20], /*index=5*/f32[20,10], s32[10]
-      // parameters_number_i = parameters_number_i + 1;
-      // xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-      // xla::XlaOp x1 = xla::Parameter(local_builder, parameters_number_i, shape1,
-      //                                 "OutPutTensor");
+      parameters_number_i = parameters_number_i + 1;
+      xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
+      xla::XlaOp x1 = xla::Parameter(local_builder, parameters_number_i, shape1,
+                                      "OutPutTensor");
       // parameters_number_i = parameters_number_i + 1;
       // xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
       // xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,

From 8bfa5583c14b757c59a4847e4b1eeca9feb06017 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 3 Apr 2024 07:18:06 +0000
Subject: [PATCH 032/323] test

---
 torch_xla/csrc/init_python_bindings.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 04517683ac2..ab1103ad373 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -904,11 +904,11 @@ class PyLoweringContext {
       }
       // hard-code to meet requirement
       // f32[20], /*index=5*/f32[20,10], s32[10]
-      parameters_number_i = parameters_number_i + 1;
+      // parameters_number_i = parameters_number_i + 1;
       xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
       xla::XlaOp x1 = xla::Parameter(local_builder, parameters_number_i, shape1,
                                       "OutPutTensor");
-      // parameters_number_i = parameters_number_i + 1;
+      parameters_number_i = parameters_number_i + 1;
       // xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
       // xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
       //                                 "WeightTensor");

From 7ed983c93292bade5701df22cfd2db436973f1f3 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 3 Apr 2024 07:22:02 +0000
Subject: [PATCH 033/323] test

---
 torch_xla/csrc/init_python_bindings.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index ab1103ad373..588a213e197 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -895,7 +895,7 @@ class PyLoweringContext {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
       int64_t parameters_number_i = 2;
       // for (at::Tensor input_argument : input_arguments) {
-      for (int i = 0; i < 4; i++) {
+      for (int i = 0; i < 3; i++) {
         xla::Shape shape =
             xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});
         xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,
@@ -909,10 +909,10 @@ class PyLoweringContext {
       xla::XlaOp x1 = xla::Parameter(local_builder, parameters_number_i, shape1,
                                       "OutPutTensor");
       parameters_number_i = parameters_number_i + 1;
-      // xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
-      // xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
-      //                                 "WeightTensor");
-      // parameters_number_i = parameters_number_i + 1;
+      xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
+      xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
+                                      "WeightTensor");
+      parameters_number_i = parameters_number_i + 1;
       // xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {10});
       // xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
       //                                 "FinalOneTensor");

From 608191653023bfd2916ecb5ea044cd5daa44c325 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 3 Apr 2024 07:22:19 +0000
Subject: [PATCH 034/323] test

---
 torch_xla/csrc/init_python_bindings.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 588a213e197..2c7709bc2e8 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -895,7 +895,7 @@ class PyLoweringContext {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
       int64_t parameters_number_i = 2;
       // for (at::Tensor input_argument : input_arguments) {
-      for (int i = 0; i < 3; i++) {
+      for (int i = 0; i < 2; i++) {
         xla::Shape shape =
             xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});
         xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,
@@ -913,9 +913,9 @@ class PyLoweringContext {
       xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
                                       "WeightTensor");
       parameters_number_i = parameters_number_i + 1;
-      // xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {10});
-      // xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
-      //                                 "FinalOneTensor");
+      xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {10});
+      xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
+                                      "FinalOneTensor");
     }
 
     // Get the backing XLA tensors from the output torch tensor handles

From 9456f7b3bafeeecaed8948e0c95966bd4978a7b1 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 3 Apr 2024 07:41:30 +0000
Subject: [PATCH 035/323] test

---
 torch_xla/csrc/init_python_bindings.cpp | 13 +++++++++++++
 torch_xla/csrc/lowering_context.cpp     |  2 ++
 2 files changed, 15 insertions(+)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 2c7709bc2e8..42e1205f95b 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -692,6 +692,19 @@ std::vector<at::Tensor> XlaUserComputation(
 
 runtime::ComputationClient::ComputationPtr CreateComputation(
     const std::string& name, xla::XlaOp root) {
+  xla::XlaBuilder* local_builder = root.builder();
+  int64_t parameters_number_i = 4;
+  xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
+  xla::XlaOp x1 = xla::Parameter(local_builder, parameters_number_i, shape1,
+                                      "OutPutTensor");
+  parameters_number_i = parameters_number_i + 1;
+  xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
+  xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
+                                      "WeightTensor");
+  parameters_number_i = parameters_number_i + 1;
+  xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {10});
+  xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
+                                      "FinalOneTensor");
   xla::XlaComputation computation = ConsumeValue(root.builder()->Build(root));
   return std::make_shared<runtime::ComputationClient::Computation>(
       name, std::move(computation));
diff --git a/torch_xla/csrc/lowering_context.cpp b/torch_xla/csrc/lowering_context.cpp
index a530995ca78..39f82a4887b 100644
--- a/torch_xla/csrc/lowering_context.cpp
+++ b/torch_xla/csrc/lowering_context.cpp
@@ -160,6 +160,8 @@ xla::StatusOr<xla::XlaComputation> LoweringContext::BuildXla() {
   if (!root_tuple_.empty() & (root_tuple_.size() == 1) &
       ((get_name_string() == "condctx") or (get_name_string() == "bodyctx"))) {
     xla = builder()->Build(root_tuple_.at(0));
+  // } else if (!root_tuple_.empty() & (root_tuple_.size() == 1) & ) {
+  //   xla = builder()->Build(root_tuple_.at(0));
   } else if (!root_tuple_.empty()) {
     xla::XlaOp root = xla::Tuple(builder(), root_tuple_);
     xla = builder()->Build(root);

From 68199fb8e219fe7ea8da757dad7c67084786b81c Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 3 Apr 2024 23:52:59 +0000
Subject: [PATCH 036/323] test

---
 torch_xla/csrc/init_python_bindings.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 42e1205f95b..958149b677b 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -702,7 +702,7 @@ runtime::ComputationClient::ComputationPtr CreateComputation(
   xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
                                       "WeightTensor");
   parameters_number_i = parameters_number_i + 1;
-  xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {10});
+  xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {10});
   xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
                                       "FinalOneTensor");
   xla::XlaComputation computation = ConsumeValue(root.builder()->Build(root));
@@ -926,7 +926,7 @@ class PyLoweringContext {
       xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
                                       "WeightTensor");
       parameters_number_i = parameters_number_i + 1;
-      xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {10});
+      xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {10});
       xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
                                       "FinalOneTensor");
     }

From f69403838133703e92f06855244fc0c6fa3f9be6 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Thu, 4 Apr 2024 00:23:44 +0000
Subject: [PATCH 037/323] test

---
 torch_xla/csrc/init_python_bindings.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 958149b677b..5f4241e603d 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -915,7 +915,7 @@ class PyLoweringContext {
                                       "UnusedArgumentsPlaceholder");
         parameters_number_i = parameters_number_i + 1;
       }
-      // hard-code to meet requirement
+      // hard-code to meet requirement by change cond xlacomputation
       // f32[20], /*index=5*/f32[20,10], s32[10]
       // parameters_number_i = parameters_number_i + 1;
       xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
@@ -926,7 +926,7 @@ class PyLoweringContext {
       xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
                                       "WeightTensor");
       parameters_number_i = parameters_number_i + 1;
-      xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {10});
+      xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {10});
       xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
                                       "FinalOneTensor");
     }

From f8b4cb1103b90603dd52857a067057b3881d06d6 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Thu, 4 Apr 2024 00:28:14 +0000
Subject: [PATCH 038/323] test

---
 torch_xla/csrc/init_python_bindings.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 5f4241e603d..a83824e856b 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -693,18 +693,18 @@ std::vector<at::Tensor> XlaUserComputation(
 runtime::ComputationClient::ComputationPtr CreateComputation(
     const std::string& name, xla::XlaOp root) {
   xla::XlaBuilder* local_builder = root.builder();
-  int64_t parameters_number_i = 4;
-  xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-  xla::XlaOp x1 = xla::Parameter(local_builder, parameters_number_i, shape1,
-                                      "OutPutTensor");
-  parameters_number_i = parameters_number_i + 1;
-  xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
-  xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
-                                      "WeightTensor");
-  parameters_number_i = parameters_number_i + 1;
-  xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {10});
-  xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
-                                      "FinalOneTensor");
+  // int64_t parameters_number_i = 4;
+  // xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
+  // xla::XlaOp x1 = xla::Parameter(local_builder, parameters_number_i, shape1,
+  //                                     "OutPutTensor");
+  // parameters_number_i = parameters_number_i + 1;
+  // xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
+  // xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
+  //                                     "WeightTensor");
+  // parameters_number_i = parameters_number_i + 1;
+  // xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {10});
+  // xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
+  //                                     "FinalOneTensor");
   xla::XlaComputation computation = ConsumeValue(root.builder()->Build(root));
   return std::make_shared<runtime::ComputationClient::Computation>(
       name, std::move(computation));

From 0e16a6ed8355b7b4c817601e8d4b1c6cd7934b8f Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Thu, 4 Apr 2024 01:09:25 +0000
Subject: [PATCH 039/323] test

---
 torch_xla/experimental/fori_loop.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 715a94cd4be..9ec4d2923b8 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -61,6 +61,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   fake_carried_inputs = []
   for carried_input in carried_inputs:
     device = carried_input.device
+    print("type carried_input: ", type(carried_input))
     #TODO(@manfei) type = carried_input.type
     fake_carried_inputs.append(
         torch.randint(10, carried_input.size(),

From ce49ba9d18315a7679fdeead6d62fbb5d4ed8cc3 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Thu, 4 Apr 2024 01:11:11 +0000
Subject: [PATCH 040/323] test

---
 torch_xla/experimental/fori_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 9ec4d2923b8..69c7871c052 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -61,7 +61,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   fake_carried_inputs = []
   for carried_input in carried_inputs:
     device = carried_input.device
-    print("type carried_input: ", type(carried_input))
+    print("type carried_input: ", carried_input.type)
     #TODO(@manfei) type = carried_input.type
     fake_carried_inputs.append(
         torch.randint(10, carried_input.size(),

From 58967a88e710dac2ac9efb58a313906088573b9e Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Thu, 4 Apr 2024 01:13:15 +0000
Subject: [PATCH 041/323] test

---
 torch_xla/experimental/fori_loop.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 69c7871c052..f9cae9f73db 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -62,6 +62,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   for carried_input in carried_inputs:
     device = carried_input.device
     print("type carried_input: ", carried_input.type)
+    print("is torch.int32: ", carried_input.type==torch.int32)
     #TODO(@manfei) type = carried_input.type
     fake_carried_inputs.append(
         torch.randint(10, carried_input.size(),

From 9b0d8e86863aa0a6063a082a5b133e9ae746f9dc Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Thu, 4 Apr 2024 01:14:58 +0000
Subject: [PATCH 042/323] test

---
 torch_xla/experimental/fori_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index f9cae9f73db..e72b3791da4 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -61,8 +61,8 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   fake_carried_inputs = []
   for carried_input in carried_inputs:
     device = carried_input.device
-    print("type carried_input: ", carried_input.type)
-    print("is torch.int32: ", carried_input.type==torch.int32)
+    print("type carried_input: ", carried_input.dtype)
+    print("is torch.int32: ", carried_input.dtype==torch.int32)
     #TODO(@manfei) type = carried_input.type
     fake_carried_inputs.append(
         torch.randint(10, carried_input.size(),

From 79261794f9915cd3a73be18c3190583af6f80ce2 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Thu, 4 Apr 2024 01:15:59 +0000
Subject: [PATCH 043/323] test

---
 torch_xla/experimental/fori_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index e72b3791da4..dc93999b5f1 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -61,8 +61,8 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   fake_carried_inputs = []
   for carried_input in carried_inputs:
     device = carried_input.device
-    print("type carried_input: ", carried_input.dtype)
-    print("is torch.int32: ", carried_input.dtype==torch.int32)
+    # print("type carried_input: ", carried_input.dtype)
+    # print("is torch.int32: ", carried_input.dtype==torch.int32)
     #TODO(@manfei) type = carried_input.type
     fake_carried_inputs.append(
         torch.randint(10, carried_input.size(),

From b9e2be1ee37528f7936bafe598606e3e6c8884c6 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Thu, 4 Apr 2024 01:18:34 +0000
Subject: [PATCH 044/323] test

---
 torch_xla/csrc/init_python_bindings.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index a83824e856b..b84a6ecac78 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -926,7 +926,7 @@ class PyLoweringContext {
       xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
                                       "WeightTensor");
       parameters_number_i = parameters_number_i + 1;
-      xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {10});
+      xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {10});
       xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
                                       "FinalOneTensor");
     }

From cb41d9b75374daafb1b53b840c7fc0e97d2743f0 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 5 Apr 2024 05:58:00 +0000
Subject: [PATCH 045/323] test

---
 torch_xla/csrc/init_python_bindings.cpp | 26 +++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index b84a6ecac78..4d2f0e905dd 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -966,6 +966,32 @@ class PyLoweringContext {
                                       "UnusedArgumentsPlaceholder");
         parameter_idx += 1;
       }
+      // hard-code to meet requirement by change cond xlacomputation
+      // f32[20], /*index=5*/f32[20,10], s32[10]
+      // parameters_number_i = parameters_number_i + 1;
+      xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
+      xla::XlaOp x1 = xla::Parameter(local_builder, parameters_number_i, shape1,
+                                      "BiasTensor");
+      parameters_number_i = parameters_number_i + 1;
+      xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
+      xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
+                                      "WeightTensor");
+      parameters_number_i = parameters_number_i + 1;
+      xla::Shape shape4 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {10});
+      xla::XlaOp x4 = xla::Parameter(local_builder, parameters_number_i, shape4,
+                                      "LInITensor");
+      parameters_number_i = parameters_number_i + 1;
+      xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
+      xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
+                                      "LOutTensor");
+    }
+
+    if (GetNameString() == "bodyctx") {
+      xla::XlaBuilder* local_builder = lowering_ctx.builder();
+      int64_t parameters_number_i = 7;
+      xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
+      xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
+                                      "WeightTensor");
     }
 
     // Get the backing XLA tensors from the output torch tensor handles

From 2695cfda8252561f338a9cf7647da10395a50b14 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 10 Apr 2024 19:57:28 +0000
Subject: [PATCH 046/323] test

---
 test/test_fori_loop_simple_linear_model_test_code.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/test_fori_loop_simple_linear_model_test_code.py b/test/test_fori_loop_simple_linear_model_test_code.py
index 006b602f001..39ffffccf41 100644
--- a/test/test_fori_loop_simple_linear_model_test_code.py
+++ b/test/test_fori_loop_simple_linear_model_test_code.py
@@ -49,6 +49,8 @@ def body_fun(y, x, l_in_i):
 # TODO(@manfei), need to create new variable to seperate old/formal HLO/IR
 l_in_0 = torch.randn(10, device=xm.xla_device())
 
+print("body_fun.weight: ", body_fun.weight)
+print("body_fun.weight_: ", body_fun.weight_)
 # def body_fun(x, y, l_in):
 #   # l_in = torch.randn(10, device=xm.xla_device())
 #   linear = torch.nn.Linear(10, 20).to(xm.xla_device())

From 7dd5843241cef4c28c9a7abe5b2b79386fd7c772 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 10 Apr 2024 22:38:47 +0000
Subject: [PATCH 047/323] test

---
 test/test_fori_loop_simple_linear_model_test_code.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_simple_linear_model_test_code.py b/test/test_fori_loop_simple_linear_model_test_code.py
index 39ffffccf41..ce03fccdbb8 100644
--- a/test/test_fori_loop_simple_linear_model_test_code.py
+++ b/test/test_fori_loop_simple_linear_model_test_code.py
@@ -24,8 +24,10 @@
 
 # --- while test case ---
 
-lower = torch.tensor([2], dtype=torch.int32, device=device)
-upper = torch.tensor([52], dtype=torch.int32, device=device)
+# lower = torch.tensor([2], dtype=torch.int32, device=device)
+# upper = torch.tensor([52], dtype=torch.int32, device=device)
+lower = torch.tensor([52], dtype=torch.int32, device=device)
+upper = torch.tensor([2], dtype=torch.int32, device=device)
 one_value = torch.tensor([1], dtype=torch.int32, device=device)
 init_val = torch.tensor([1], dtype=torch.int32, device=device)
 # one_one = torch.one(1, dtype=torch.int32, device=device)

From 535797e186f81161fce08415db1b198886e01b91 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Thu, 11 Apr 2024 04:33:51 +0000
Subject: [PATCH 048/323] test

---
 torch_xla/experimental/fori_loop.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index dc93999b5f1..4035e207f11 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -94,9 +94,9 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",
                                                       body_hlo)
-  body_hlo_print = xb.get_computation_hlo(body_computation)
-  print("body computation: !!!!!!!!!")
-  print(body_hlo_print)
+  # body_hlo_print = xb.get_computation_hlo(body_computation)
+  # print("body computation: !!!!!!!!!")
+  # print(body_hlo_print)
 
   # trans fake_carried_inputs from list(tensor) to list(xla::op), which part could change init of xla::while
   kwargs = {}

From 99709006756c3d553bbe85b349d871dc32a57287 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Thu, 11 Apr 2024 05:10:37 +0000
Subject: [PATCH 049/323] test

---
 test/test_fori_loop_simple_linear_model_test_code.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/test_fori_loop_simple_linear_model_test_code.py b/test/test_fori_loop_simple_linear_model_test_code.py
index ce03fccdbb8..066eb34e91e 100644
--- a/test/test_fori_loop_simple_linear_model_test_code.py
+++ b/test/test_fori_loop_simple_linear_model_test_code.py
@@ -20,7 +20,11 @@
 # l_in = torch.randn(10, device=xm.xla_device())
 # linear = torch.nn.Linear(10, 20).to(xm.xla_device())
 # l_out = linear(l_in)
+<<<<<<< HEAD
 # print("linear one: ", l_out)
+=======
+# print("$$$ linear one: ", l_out)
+>>>>>>> test
 
 # --- while test case ---
 

From f3a9df20e2d2034f2b67340144265fd626e2d5ba Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Thu, 11 Apr 2024 07:16:58 +0000
Subject: [PATCH 050/323] test

---
 torch_xla/experimental/fori_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 4035e207f11..78afdb10802 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -72,7 +72,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
 
   # generate cond_fn xlacomputation
   # TODO(@manfei): specify which element is for which argument like a,b,c
-  cond_result = cond_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], output_value=fake_carried_inputs[-2], bias_0=fake_carried_inputs[-1])
+  cond_result = cond_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-2], output_value=fake_carried_inputs[-1])
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
   additional_inputs_list = list(fake_carried_inputs[2:])
@@ -87,7 +87,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   print(cond_hlo_print)
 
   # generate body_fn xlacomputation
-  body_result = body_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], output_value=fake_carried_inputs[-2], bias_0=fake_carried_inputs[-1])
+  body_result = body_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-2], output_value=fake_carried_inputs[-1])
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
   body_ctx.set_name_string("bodyctx")
   body_ctx.buildforiloop(list(body_result), [])

From a671982d582dfd9e8da5717e3e43a208f40435ef Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Thu, 11 Apr 2024 20:54:56 +0000
Subject: [PATCH 051/323] test

---
 torch_xla/csrc/init_python_bindings.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 4d2f0e905dd..8e1a800b316 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -988,7 +988,7 @@ class PyLoweringContext {
 
     if (GetNameString() == "bodyctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
-      int64_t parameters_number_i = 7;
+      int64_t parameters_number_i = 6; // 7;
       xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
       xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
                                       "WeightTensor");

From d143181dd8df7c950a6545553f0fd0df92f11f3f Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Thu, 11 Apr 2024 21:12:56 +0000
Subject: [PATCH 052/323] test

---
 torch_xla/csrc/init_python_bindings.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 8e1a800b316..4d2f0e905dd 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -988,7 +988,7 @@ class PyLoweringContext {
 
     if (GetNameString() == "bodyctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
-      int64_t parameters_number_i = 6; // 7;
+      int64_t parameters_number_i = 7;
       xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
       xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
                                       "WeightTensor");

From 7128f70fe0d49eb9a2b8f6fa29f3ffb90ab63f34 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Thu, 11 Apr 2024 21:42:25 +0000
Subject: [PATCH 053/323] test

---
 test/test_fori_loop_simple_linear_model_test_code.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_simple_linear_model_test_code.py b/test/test_fori_loop_simple_linear_model_test_code.py
index 066eb34e91e..58352e5c548 100644
--- a/test/test_fori_loop_simple_linear_model_test_code.py
+++ b/test/test_fori_loop_simple_linear_model_test_code.py
@@ -12,7 +12,7 @@
 # import torch_xla.core.xla_builder as xb
 import torch_xla.utils.utils as xu
 
-torch.set_grad_enabled(False)
+# torch.set_grad_enabled(False)
 
 device = xm.xla_device()
 

From b993c091ca9e6ff27f2512c39b83a77ed02d0f71 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Thu, 11 Apr 2024 21:52:29 +0000
Subject: [PATCH 054/323] test

---
 ...oop_with_while_loop_simple_add_dispatch_in_torch.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 55a02a55e48..5f8d7ec01b5 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -20,9 +20,13 @@ def _fake_while_loop(cond_fn, body_fn, operands):
 
 
 def _fake_fori_loop(lower, upper, body_fun, *init_val):
-  (a, b) = init_val
-  for i in range((upper - lower)[0]):
-    a = body_fun(a, b)
+  if len(init_val) > 1:
+    (a, b) = init_val
+    for i in range((upper - lower)[0]):
+      a = body_fun(a, b)
+  else:
+    for i in range((upper - lower)[0]):
+      a = body_fun(*init_val)
   return a
 
 def _fake_fori_loop(lower, upper, body_fun, *init_val):

From 99d2d78fb7fd001498357b080cdb4e92e6dad591 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Thu, 11 Apr 2024 21:54:19 +0000
Subject: [PATCH 055/323] test

---
 ...est_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 5f8d7ec01b5..e1a06b05d7d 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -95,6 +95,7 @@ def test_fori_loop_tpu_addition(self):
 
     xm.mark_step()
     device = xm.xla_device()
+    torch.set_grad_enabled(False)
 
     lower = torch.tensor([2], dtype=torch.int32, device=device)
     upper = torch.tensor([52], dtype=torch.int32, device=device)

From 812c072ae458fb8f4941dfa6917c47fe567d812a Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 00:28:39 +0000
Subject: [PATCH 056/323] rebase

---
 ...fori_loop_simple_linear_model_test_code.py | 187 ++----------------
 ...while_loop_simple_add_dispatch_in_torch.py |  24 +--
 torch_xla/experimental/fori_loop.py           |  20 +-
 3 files changed, 30 insertions(+), 201 deletions(-)

diff --git a/test/test_fori_loop_simple_linear_model_test_code.py b/test/test_fori_loop_simple_linear_model_test_code.py
index 58352e5c548..07a7636d880 100644
--- a/test/test_fori_loop_simple_linear_model_test_code.py
+++ b/test/test_fori_loop_simple_linear_model_test_code.py
@@ -1,15 +1,11 @@
 import os
-# import unittest
-# from typing import Callable, Dict, List
 
 import torch
 import torch_xla
 # We need to import the underlying implementation function to register with the dispatcher
 import torch_xla.experimental.fori_loop
 from torch_xla.experimental.fori_loop import fori_loop
-# from torch._higher_order_ops.while_loop import while_loop
 import torch_xla.core.xla_model as xm
-# import torch_xla.core.xla_builder as xb
 import torch_xla.utils.utils as xu
 
 # torch.set_grad_enabled(False)
@@ -17,181 +13,34 @@
 device = xm.xla_device()
 
 # --- linear one ---
-# l_in = torch.randn(10, device=xm.xla_device())
-# linear = torch.nn.Linear(10, 20).to(xm.xla_device())
-# l_out = linear(l_in)
-<<<<<<< HEAD
-# print("linear one: ", l_out)
-=======
-# print("$$$ linear one: ", l_out)
->>>>>>> test
+l_in = torch.randn(10, device=xm.xla_device())
+linear = torch.nn.Linear(10, 20).to(xm.xla_device())
+l_out = linear(l_in)
+print("$$$ different linear model with different weight/bias: ")
+print(l_out)
 
 # --- while test case ---
-
-# lower = torch.tensor([2], dtype=torch.int32, device=device)
-# upper = torch.tensor([52], dtype=torch.int32, device=device)
-lower = torch.tensor([52], dtype=torch.int32, device=device)
-upper = torch.tensor([2], dtype=torch.int32, device=device)
-one_value = torch.tensor([1], dtype=torch.int32, device=device)
+upper = torch.tensor([52], dtype=torch.int32, device=device)
+lower = torch.tensor([0], dtype=torch.int32, device=device)
 init_val = torch.tensor([1], dtype=torch.int32, device=device)
-# one_one = torch.one(1, dtype=torch.int32, device=device)
-
-# def body_fun(l_in):
-#   # l_in = torch.randn(10, device=xm.xla_device())
-#   linear = torch.nn.Linear(10, 20).to(xm.xla_device())
-#   # l_out = linear(l_in)
-#   return linear(l_in) # torch.add(a, b) # [0])
 linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
 
-def body_fun(y, x, l_in_i):
-  # l_in = torch.randn(10, device=xm.xla_device())
-  # linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
-  l_out = linear_0(l_in_i)
-  # placeholder_func = torch.rand(size = l_out.size(), device = device)
-  # placeholder_input = torch.rand(size = l_in_i.size(), device = device)
-  # return torch.add(y, x), l_out, placeholder_func, placeholder_input # linear_0(l_in_i), linear_0, l_in_i # additional return: body and input-placeholder   # linear(l_in) # torch.add(a, b) # [0])
-  return torch.add(y, x), l_out
+# def body_fun(l_in_i):
+#   l_out = linear_0(l_in_i)
+#   return l_out
 
-# TODO(@manfei), need to create new variable to seperate old/formal HLO/IR
 l_in_0 = torch.randn(10, device=xm.xla_device())
 
-print("body_fun.weight: ", body_fun.weight)
-print("body_fun.weight_: ", body_fun.weight_)
-# def body_fun(x, y, l_in):
-#   # l_in = torch.randn(10, device=xm.xla_device())
-#   linear = torch.nn.Linear(10, 20).to(xm.xla_device())
-#   # l_out = linear(l_in)
-#   return torch.add(x, y), linear(l_in) # linear(l_in) # torch.add(a, b) # [0])
-
-# placeholder_func = torch.rand(size = l_out.size(), device = device)
-# placeholder_input = torch.rand(size = l_in_i.size(), device = device)
-print("test code, body_fun: ", body_fun)
-
-lower_, upper_, res_ = fori_loop(upper, lower, body_fun, one_value, init_val, l_in_0) # , placeholder_func, placeholder_input) # , linear_0, l_in_0)
-
-print("lower_: ", lower_)
-print("upper_: ", upper_)
-print("res_: ", res_)
-
-# --- linear two ---
-# l_in_2 = torch.randn(10, device=xm.xla_device())
-# linear_2 = torch.nn.Linear(10, 20).to(xm.xla_device())
-# l_out_2 = linear(l_in_2)
-# print("linear two: ", l_out_2)
-
-# =================================================================================
-
-# import numpy as np
-# # create dummy data for training
-# # x_values = [i for i in range(11)]
-# # x_train = np.array(x_values, dtype=np.float32)
-# # x_train = x_train.reshape(-1, 1)
-
-# # y_values = [2*i + 1 for i in x_values]
-# # y_train = np.array(y_values, dtype=np.float32)
-# # y_train = y_train.reshape(-1, 1)
-
-# batch_size = 2
-
-# train_loader = xu.SampleGenerator(
-#     data=(torch.zeros(batch_size, 1), torch.zeros(batch_size, dtype=torch.float32)),
-#     sample_count=64 // batch_size // xm.xrt_world_size())
-# test_loader = xu.SampleGenerator(
-#     data=(torch.zeros(batch_size, 1, torch.zeros(batch_size, dtype=torch.float32)),
-#     sample_count=32 // batch_size // xm.xrt_world_size())
-
-# # import torch
-# from torch.autograd import Variable
-
-# class linearRegression(torch.nn.Module):
-#     def __init__(self, inputSize, outputSize):
-#         super(linearRegression, self).__init__()
-#         self.linear = torch.nn.Linear(inputSize, outputSize).to(device)
-
-#     def forward(self, x):
-#         out = self.linear(x)
-#         return out
-
-# # --- training ---
-# inputDim = 1        # takes variable 'x' 
-# outputDim = 1       # takes variable 'y'
-# learningRate = 0.01 * xm.xrt_world_size()
-# epochs = 10 # 100
-
-# model = linearRegression(inputDim, outputDim).to(device)
-# # model = MNIST().to(device)
-# ##### For GPU #######
-# # if torch.cuda.is_available():
-# #     model.cuda()
-
-# if xr.using_pjrt():
-#   xm.broadcast_master_param(model)
-
-# criterion = torch.nn.MSELoss() 
-# optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)
-
-# for epoch in range(epochs):
-#     # Converting inputs and labels to Variable
-#     # if torch.cuda.is_available():
-#     #     inputs = Variable(torch.from_numpy(x_train).cuda())
-#     #     labels = Variable(torch.from_numpy(y_train).cuda())
-#     # else:
-#     inputs = Variable(torch.from_numpy(x_train)).to(device)
-#     labels = Variable(torch.from_numpy(y_train)).to(device)
-
-#     # Clear gradient buffers because we don't want any gradient from previous epoch to carry forward, dont want to cummulate gradients
-#     optimizer.zero_grad()
-
-#     # get output from the model, given the inputs
-#     outputs = model(inputs)
-
-#     # get loss for the predicted output
-#     loss = criterion(outputs, labels)
-#     print(loss)
-#     # get gradients w.r.t to parameters
-#     loss.backward()
-
-#     # update parameters
-#     # optimizer.step()
-#     xm.optimizer_step(optimizer)
-
-#     print('epoch {}, loss {}'.format(epoch, loss.item()))
-
-# # --- while simple test case ---
-
-# # device = xm.xla_device()
-
-# lower = torch.tensor([2], dtype=torch.int32, device=device)
-# upper = torch.tensor([52], dtype=torch.int32, device=device)
-# one_value = torch.tensor([1], dtype=torch.int32, device=device)
-# init_val = torch.tensor([1], dtype=torch.int32, device=device)
-
-# def body_fun(a, b):
-#   return torch.add(a, b) # [0])
-
-# lower_, upper_, res_ = fori_loop(upper, lower, body_fun, one_value, init_val)
-
-# print("lower_: ", lower_)
-# print("upper_: ", upper_)
-# print("res_: ", res_)
+upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_= fori_loop(upper, lower, linear_0, init_val, l_in_0)
 
-# # --- test ---
-# for epoch in range(epochs):
-#     with torch.no_grad(): # we don't need gradients in the testing phase
-#         if torch.cuda.is_available():
-#             predicted = model(Variable(torch.from_numpy(x_train).cuda())).cpu().data.numpy()
-#         else:
-#             predicted = model(Variable(torch.from_numpy(x_train))).data.numpy()
-#         print(epoch, "-th prediction finised") # ed result: ", predicted)
+print("$$$ fori_loop l_out_: ")
+print(l_out_)
 
-# print("do one more prediction")
-# with torch.no_grad(): # we don't need gradients in the testing phase
-#     if torch.cuda.is_available():
-#         predicted = model(Variable(torch.from_numpy(x_train).cuda())).cpu().data.numpy()
-#     else:
-#         predicted = model(Variable(torch.from_numpy(x_train))).data.numpy()
-#     print(predicted)
-# print("finished one more prediction")
+range_num = upper - lower
+for i in range(range_num[0]):
+  l_out_expected = linear_0(l_in_0)
+print("$$$ without-fori_loop l_out_: ")
+print(l_out_expected)
 
 # # --- draw ---
 # # plt.clf()
diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index e1a06b05d7d..0bc5013438d 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -29,11 +29,6 @@ def _fake_fori_loop(lower, upper, body_fun, *init_val):
       a = body_fun(*init_val)
   return a
 
-def _fake_fori_loop(lower, upper, body_fun, *init_val):
-  (plus_value, init_val) = init_val
-  for i in range((upper - lower)[0]):
-    plus_value, init_val = body_fun(plus_value, init_val)
-  return init_val
 
 class WhileLoopTest(unittest.TestCase):
 
@@ -91,25 +86,24 @@ def body_fun(a, b):
     expected = _fake_fori_loop(lower, upper, body_fun, init_val, one_value)
     self.assertEqual(expected, res_)
 
-  def test_fori_loop_tpu_addition(self):
+  def test_fori_loop_tpu_simple_linear(self):
 
     xm.mark_step()
     device = xm.xla_device()
     torch.set_grad_enabled(False)
 
-    lower = torch.tensor([2], dtype=torch.int32, device=device)
     upper = torch.tensor([52], dtype=torch.int32, device=device)
-    plus_value = torch.tensor([1], dtype=torch.int32, device=device)
+    lower = torch.tensor([0], dtype=torch.int32, device=device)
     init_val = torch.tensor([1], dtype=torch.int32, device=device)
+    l_in_0 = torch.randn(10, device=xm.xla_device())
+    
+    linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
 
-    def body_fun(*argus):
-      plus_value, init_val = argus
-      return plus_value, torch.add(plus_value, init_val)
-
-    _, _, _, actual = fori_loop(upper, lower, body_fun, plus_value, init_val)
-    expected = _fake_fori_loop(lower, upper, body_fun, plus_value, init_val)
-    self.assertEqual(expected, actual)
+    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_= fori_loop(upper, lower, linear_0, init_val, l_in_0)
+    
+    expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 
+    self.assertTrue(torch.all(torch.eq(expected, l_out_)))
 
 if __name__ == '__main__':
   test = unittest.main()
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 78afdb10802..57f7162bf3f 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -52,27 +52,22 @@ def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   return _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=additional_inputs) #  a=a, b=b, c=c,
 
 
-def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
-  print("carried_inputs: ", carried_inputs)
-  print("additional_inputs: ", additional_inputs)
+def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
   # untuple carried_inputs from while_loop
   carried_inputs = carried_inputs[0]
   # fake carried_inputs to split formal code
   fake_carried_inputs = []
   for carried_input in carried_inputs:
     device = carried_input.device
-    # print("type carried_input: ", carried_input.dtype)
-    # print("is torch.int32: ", carried_input.dtype==torch.int32)
     #TODO(@manfei) type = carried_input.type
     fake_carried_inputs.append(
         torch.randint(10, carried_input.size(),
                       dtype=carried_input.dtype).to(device))
   fake_carried_inputs = tuple(fake_carried_inputs)
-  print("fake_carried_inputs: ", fake_carried_inputs)
 
   # generate cond_fn xlacomputation
   # TODO(@manfei): specify which element is for which argument like a,b,c
-  cond_result = cond_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-2], output_value=fake_carried_inputs[-1])
+  cond_result = cond_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], output_value=fake_carried_inputs[-2], bias_0=fake_carried_inputs[-1])
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
   additional_inputs_list = list(fake_carried_inputs[2:])
@@ -82,21 +77,15 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   cond_hlo = cond_ctx.hlo()
   cond_computation = xb.computation_from_module_proto("condcomputation",
                                                       cond_hlo)
-  cond_hlo_print = xb.get_computation_hlo(cond_computation)
-  print("cond computation: !!!!!!!!!")
-  print(cond_hlo_print)
 
   # generate body_fn xlacomputation
-  body_result = body_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-2], output_value=fake_carried_inputs[-1])
+  body_result = body_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], output_value=fake_carried_inputs[-2], bias_0=fake_carried_inputs[-1])
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
   body_ctx.set_name_string("bodyctx")
   body_ctx.buildforiloop(list(body_result), [])
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",
                                                       body_hlo)
-  # body_hlo_print = xb.get_computation_hlo(body_computation)
-  # print("body computation: !!!!!!!!!")
-  # print(body_hlo_print)
 
   # trans fake_carried_inputs from list(tensor) to list(xla::op), which part could change init of xla::while
   kwargs = {}
@@ -118,9 +107,6 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
       body_computation=body_computation)
   name = 'fori_loop_ed_torch_func'
   computation = w.build(name)
-  hlo_print = xb.get_computation_hlo(computation)
-  print("while computation: !!!!!!!!!")
-  print(hlo_print)
 
   # gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',

From 89dd1e8d05fd4882615c64bc52e76498c49f7d47 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 00:32:45 +0000
Subject: [PATCH 057/323] update

---
 torch_xla/csrc/init_python_bindings.cpp | 115 ++++++++++--------------
 1 file changed, 46 insertions(+), 69 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 4d2f0e905dd..94c84213fe6 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -120,8 +120,6 @@ void PrepareToExit() {
   runtime::ComputationClient* client =
       runtime::GetComputationClientIfInitialized();
   if (client != nullptr) {
-    auto xla_device = GetDeviceOrCurrent("");
-    SetAllReduceToken(xla_device, nullptr);
     XLAGraphExecutor::Get()->WaitDeviceOps({});
   }
 }
@@ -464,13 +462,12 @@ void SyncLiveTensors(const std::string& device_str,
 }
 
 void StepMarker(const std::string& device_str,
-                const std::vector<std::string>& devices, bool wait,
-                bool reset_scope) {
+                const std::vector<std::string>& devices, bool wait) {
   tsl::profiler::TraceMe activity("StepMarker",
                                   tsl::profiler::TraceMeLevel::kInfo);
   torch::lazy::BackendDevice device = GetDeviceOrCurrent(device_str);
   XLAGraphExecutor::Get()->SyncLiveTensorsGraph(&device, devices, wait);
-  XLAGraphExecutor::Get()->MarkStep(device, reset_scope);
+  XLAGraphExecutor::Get()->MarkStep(device);
   bool debug_mode = runtime::sys_util::GetEnvBool("PT_XLA_DEBUG", false);
   if (TF_PREDICT_FALSE(debug_mode)) {
     std::string report = runtime::metrics::CreatePerformanceReport(
@@ -902,35 +899,7 @@ class PyLoweringContext {
       : lowering_ctx("PyLoweringContext", device) {}
 
   // Builds a HLO graph given a set of output tensors.
-  void Build(std::vector<at::Tensor> tensors,
-             std::vector<at::Tensor> input_arguments = {}) {
-    if (GetNameString() == "condctx") {
-      xla::XlaBuilder* local_builder = lowering_ctx.builder();
-      int64_t parameters_number_i = 2;
-      // for (at::Tensor input_argument : input_arguments) {
-      for (int i = 0; i < 2; i++) {
-        xla::Shape shape =
-            xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});
-        xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,
-                                      "UnusedArgumentsPlaceholder");
-        parameters_number_i = parameters_number_i + 1;
-      }
-      // hard-code to meet requirement by change cond xlacomputation
-      // f32[20], /*index=5*/f32[20,10], s32[10]
-      // parameters_number_i = parameters_number_i + 1;
-      xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-      xla::XlaOp x1 = xla::Parameter(local_builder, parameters_number_i, shape1,
-                                      "OutPutTensor");
-      parameters_number_i = parameters_number_i + 1;
-      xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
-      xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
-                                      "WeightTensor");
-      parameters_number_i = parameters_number_i + 1;
-      xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {10});
-      xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
-                                      "FinalOneTensor");
-    }
-
+  void Build(std::vector<at::Tensor> tensors) {
     // Get the backing XLA tensors from the output torch tensor handles
     std::vector<XLATensorPtr> xtensors =
         GetXlaTensors(tensors, /*want_all=*/true);
@@ -957,42 +926,60 @@ class PyLoweringContext {
                      std::vector<at::Tensor> input_arguments = {}) {
     if (GetNameString() == "condctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
-      // hard-code parameter_idx to 2 to skip existing upper/lower arguments
-      int64_t parameter_idx = 2;
-      for (at::Tensor input_argument : input_arguments) {
+      int64_t parameters_number_i = 2;
+      // for (at::Tensor input_argument : input_arguments) {
+      for (int i = 0; i < 2; i++) {
         xla::Shape shape =
             xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});
-        xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
+        xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,
                                       "UnusedArgumentsPlaceholder");
-        parameter_idx += 1;
+        parameters_number_i = parameters_number_i + 1;
       }
       // hard-code to meet requirement by change cond xlacomputation
       // f32[20], /*index=5*/f32[20,10], s32[10]
       // parameters_number_i = parameters_number_i + 1;
-      xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
+
+      xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {10});
       xla::XlaOp x1 = xla::Parameter(local_builder, parameters_number_i, shape1,
-                                      "BiasTensor");
+                                      "LInITensor");
+      // xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
+      // xla::XlaOp x1 = xla::Parameter(local_builder, parameters_number_i, shape1,
+      //                                 "BiasTensor");
       parameters_number_i = parameters_number_i + 1;
       xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
       xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
                                       "WeightTensor");
+      // parameters_number_i = parameters_number_i + 1;
+      // xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
+      // xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
+      //                                 "LOutTensor");
+      // parameters_number_i = parameters_number_i + 1;
+      // xla::Shape shape4 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {10});
+      // xla::XlaOp x4 = xla::Parameter(local_builder, parameters_number_i, shape4,
+      //                                 "LInITensor");
+      // parameters_number_i = parameters_number_i + 1;
+      // xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
+      // xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
+      //                                 "LOutTensor");
       parameters_number_i = parameters_number_i + 1;
-      xla::Shape shape4 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {10});
+      xla::Shape shape4 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
       xla::XlaOp x4 = xla::Parameter(local_builder, parameters_number_i, shape4,
-                                      "LInITensor");
+                                      "BiasTensor");
       parameters_number_i = parameters_number_i + 1;
       xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
       xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
                                       "LOutTensor");
+      //                                 // input_value!!!, weight_0, output_value, bias!!!
     }
 
-    if (GetNameString() == "bodyctx") {
-      xla::XlaBuilder* local_builder = lowering_ctx.builder();
-      int64_t parameters_number_i = 7;
-      xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-      xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
-                                      "WeightTensor");
-    }
+    // // hard-code modify body xlacomputation input arguments
+    // if (GetNameString() == "bodyctx") {
+    //   xla::XlaBuilder* local_builder = lowering_ctx.builder();
+    //   int64_t parameters_number_i = 7;
+    //   xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
+    //   xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
+    //                                   "WeightTensor");
+    // }
 
     // Get the backing XLA tensors from the output torch tensor handles
     std::vector<XLATensorPtr> xtensors =
@@ -1761,12 +1748,11 @@ void InitXlaModuleBindings(py::module m) {
   m.def(
       "_xla_step_marker",
       [](const std::string& device, const std::vector<std::string>& devices,
-         bool wait, bool reset_scope) {
+         bool wait) {
         NoGilSection nogil;
-        StepMarker(device, devices, wait, reset_scope);
+        StepMarker(device, devices, wait);
       },
-      py::arg("device") = "", py::arg("devices"), py::arg("wait") = true,
-      py::arg("reset_scope") = true);
+      py::arg("device") = "", py::arg("devices"), py::arg("wait") = true);
   m.def("_get_stablehlo",
         [](const std::vector<at::Tensor>& tensors, const std::string& device,
            const std::vector<std::string>& devices,
@@ -2389,21 +2375,12 @@ void InitXlaModuleBindings(py::module m) {
         [](at::Tensor& self, const at::Tensor& source) -> at::Tensor& {
           return XLANativeFunctions::set_(self, source);
         });
-  m.def("_xla_tpu_custom_call",
-        [](const std::vector<at::Tensor>& inputs, const std::string& payload,
-           const std::vector<std::vector<int64_t>>& output_shapes,
-           const std::vector<py::object>& output_dtypes)
-            -> std::vector<at::Tensor> {
-          std::vector<at::ScalarType> dtypes;
-          dtypes.reserve(output_dtypes.size());
-          for (auto& dtype : output_dtypes) {
-            dtypes.push_back(
-                reinterpret_cast<THPDtype*>(dtype.ptr())->scalar_type);
-          }
-
-          auto xtensors = tensor_methods::tpu_custom_call(
-              bridge::GetXlaTensors(inputs), payload, output_shapes, dtypes);
-          return bridge::AtenFromXlaTensors(xtensors);
+  m.def("_xla_tpu_custom_call_",
+        [](const std::vector<at::Tensor>& outputs,
+           const std::vector<at::Tensor>& inputs, const std::string& payload) {
+          auto x_outputs = bridge::GetXlaTensors(outputs);
+          return tensor_methods::tpu_custom_call_(
+              x_outputs, bridge::GetXlaTensors(inputs), payload);
         });
   m.def("_set_xla_custom_op_name_prefix",
         [](const at::Tensor& input, const std::string& op_name_prefix,

From cfbe3a6a2ae9d50eb99e66bd4f14ad745a133a12 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 00:34:02 +0000
Subject: [PATCH 058/323] update

---
 test/cpp/test_xla_sharding.cpp                | 16 ------
 ...fori_loop_simple_linear_model_test_code.py | 50 -------------------
 2 files changed, 66 deletions(-)
 delete mode 100644 test/test_fori_loop_simple_linear_model_test_code.py

diff --git a/test/cpp/test_xla_sharding.cpp b/test/cpp/test_xla_sharding.cpp
index 167ffd753e7..e1f908b5c80 100644
--- a/test/cpp/test_xla_sharding.cpp
+++ b/test/cpp/test_xla_sharding.cpp
@@ -435,21 +435,5 @@ TEST_F(XLAShardingTest, PrepareOutputShardingPropagation) {
           ->HasValue());
 }
 
-TEST_F(XLAShardingTest, TestForiLoopAddUnusedParameterInXlaComputation) {
-  xla::Shape shape = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {});
-  // Build simple addition.
-  xla::XlaBuilder b("builder");
-  auto x = xla::Parameter(&b, /*parameter_number=*/0, shape, "p0");
-  // Add unused parameter before create xlacomputation
-  xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {});
-  auto zzz = xla::Parameter(&b, /*parameter_number=*/1, shape2, "p1");
-  auto y = xla::Add(x, xla::ConstantR0<float>(&b, 3));
-  xla::XlaComputation xla_computation =
-      ConsumeValue(b.Build(/*remove_dynamic_dimensions=*/false));
-
-  // Check whether the unused parameter has been included into xlacomputation
-  EXPECT_EQ(xla_computation.GetProgramShape()->parameters_size(), 2);
-}
-
 }  // namespace cpp_test
 }  // namespace torch_xla
diff --git a/test/test_fori_loop_simple_linear_model_test_code.py b/test/test_fori_loop_simple_linear_model_test_code.py
deleted file mode 100644
index 07a7636d880..00000000000
--- a/test/test_fori_loop_simple_linear_model_test_code.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import os
-
-import torch
-import torch_xla
-# We need to import the underlying implementation function to register with the dispatcher
-import torch_xla.experimental.fori_loop
-from torch_xla.experimental.fori_loop import fori_loop
-import torch_xla.core.xla_model as xm
-import torch_xla.utils.utils as xu
-
-# torch.set_grad_enabled(False)
-
-device = xm.xla_device()
-
-# --- linear one ---
-l_in = torch.randn(10, device=xm.xla_device())
-linear = torch.nn.Linear(10, 20).to(xm.xla_device())
-l_out = linear(l_in)
-print("$$$ different linear model with different weight/bias: ")
-print(l_out)
-
-# --- while test case ---
-upper = torch.tensor([52], dtype=torch.int32, device=device)
-lower = torch.tensor([0], dtype=torch.int32, device=device)
-init_val = torch.tensor([1], dtype=torch.int32, device=device)
-linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
-
-# def body_fun(l_in_i):
-#   l_out = linear_0(l_in_i)
-#   return l_out
-
-l_in_0 = torch.randn(10, device=xm.xla_device())
-
-upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_= fori_loop(upper, lower, linear_0, init_val, l_in_0)
-
-print("$$$ fori_loop l_out_: ")
-print(l_out_)
-
-range_num = upper - lower
-for i in range(range_num[0]):
-  l_out_expected = linear_0(l_in_0)
-print("$$$ without-fori_loop l_out_: ")
-print(l_out_expected)
-
-# # --- draw ---
-# # plt.clf()
-# # plt.plot(x_train, y_train, 'go', label='True data', alpha=0.5)
-# # plt.plot(x_train, predicted, '--', label='Predictions', alpha=0.5)
-# # plt.legend(loc='best')
-# # plt.show()
\ No newline at end of file

From 4bcfa5089c5d61b29183d081ac96c43a61143b83 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 00:40:04 +0000
Subject: [PATCH 059/323] update

---
 ...while_loop_simple_add_dispatch_in_torch.py |  3 +-
 torch_xla/csrc/init_python_bindings.cpp       | 94 ++++++++-----------
 2 files changed, 39 insertions(+), 58 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 0bc5013438d..830b3ff7df7 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -23,7 +23,8 @@ def _fake_fori_loop(lower, upper, body_fun, *init_val):
   if len(init_val) > 1:
     (a, b) = init_val
     for i in range((upper - lower)[0]):
-      a = body_fun(a, b)
+      # a = body_fun(a, b)
+      a = body_fun(*init_val)
   else:
     for i in range((upper - lower)[0]):
       a = body_fun(*init_val)
diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 94c84213fe6..9fa9e89b191 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -120,6 +120,8 @@ void PrepareToExit() {
   runtime::ComputationClient* client =
       runtime::GetComputationClientIfInitialized();
   if (client != nullptr) {
+    auto xla_device = GetDeviceOrCurrent("");
+    SetAllReduceToken(xla_device, nullptr);
     XLAGraphExecutor::Get()->WaitDeviceOps({});
   }
 }
@@ -462,12 +464,13 @@ void SyncLiveTensors(const std::string& device_str,
 }
 
 void StepMarker(const std::string& device_str,
-                const std::vector<std::string>& devices, bool wait) {
+                const std::vector<std::string>& devices, bool wait,
+                bool reset_scope) {
   tsl::profiler::TraceMe activity("StepMarker",
                                   tsl::profiler::TraceMeLevel::kInfo);
   torch::lazy::BackendDevice device = GetDeviceOrCurrent(device_str);
   XLAGraphExecutor::Get()->SyncLiveTensorsGraph(&device, devices, wait);
-  XLAGraphExecutor::Get()->MarkStep(device);
+  XLAGraphExecutor::Get()->MarkStep(device, reset_scope);
   bool debug_mode = runtime::sys_util::GetEnvBool("PT_XLA_DEBUG", false);
   if (TF_PREDICT_FALSE(debug_mode)) {
     std::string report = runtime::metrics::CreatePerformanceReport(
@@ -689,19 +692,6 @@ std::vector<at::Tensor> XlaUserComputation(
 
 runtime::ComputationClient::ComputationPtr CreateComputation(
     const std::string& name, xla::XlaOp root) {
-  xla::XlaBuilder* local_builder = root.builder();
-  // int64_t parameters_number_i = 4;
-  // xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-  // xla::XlaOp x1 = xla::Parameter(local_builder, parameters_number_i, shape1,
-  //                                     "OutPutTensor");
-  // parameters_number_i = parameters_number_i + 1;
-  // xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
-  // xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
-  //                                     "WeightTensor");
-  // parameters_number_i = parameters_number_i + 1;
-  // xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {10});
-  // xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
-  //                                     "FinalOneTensor");
   xla::XlaComputation computation = ConsumeValue(root.builder()->Build(root));
   return std::make_shared<runtime::ComputationClient::Computation>(
       name, std::move(computation));
@@ -926,41 +916,22 @@ class PyLoweringContext {
                      std::vector<at::Tensor> input_arguments = {}) {
     if (GetNameString() == "condctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
-      int64_t parameters_number_i = 2;
-      // for (at::Tensor input_argument : input_arguments) {
+      // hard-code parameter_idx to 2 to skip existing upper/lower arguments
+      int64_t parameter_idx = 2;
       for (int i = 0; i < 2; i++) {
         xla::Shape shape =
             xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});
-        xla::XlaOp x = xla::Parameter(local_builder, parameters_number_i, shape,
+        xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
                                       "UnusedArgumentsPlaceholder");
-        parameters_number_i = parameters_number_i + 1;
+        parameter_idx += 1;
       }
-      // hard-code to meet requirement by change cond xlacomputation
-      // f32[20], /*index=5*/f32[20,10], s32[10]
-      // parameters_number_i = parameters_number_i + 1;
-
       xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {10});
       xla::XlaOp x1 = xla::Parameter(local_builder, parameters_number_i, shape1,
                                       "LInITensor");
-      // xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-      // xla::XlaOp x1 = xla::Parameter(local_builder, parameters_number_i, shape1,
-      //                                 "BiasTensor");
       parameters_number_i = parameters_number_i + 1;
       xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
       xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
                                       "WeightTensor");
-      // parameters_number_i = parameters_number_i + 1;
-      // xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-      // xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
-      //                                 "LOutTensor");
-      // parameters_number_i = parameters_number_i + 1;
-      // xla::Shape shape4 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {10});
-      // xla::XlaOp x4 = xla::Parameter(local_builder, parameters_number_i, shape4,
-      //                                 "LInITensor");
-      // parameters_number_i = parameters_number_i + 1;
-      // xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-      // xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
-      //                                 "LOutTensor");
       parameters_number_i = parameters_number_i + 1;
       xla::Shape shape4 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
       xla::XlaOp x4 = xla::Parameter(local_builder, parameters_number_i, shape4,
@@ -969,17 +940,16 @@ class PyLoweringContext {
       xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
       xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
                                       "LOutTensor");
-      //                                 // input_value!!!, weight_0, output_value, bias!!!
     }
 
-    // // hard-code modify body xlacomputation input arguments
-    // if (GetNameString() == "bodyctx") {
-    //   xla::XlaBuilder* local_builder = lowering_ctx.builder();
-    //   int64_t parameters_number_i = 7;
-    //   xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-    //   xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
-    //                                   "WeightTensor");
-    // }
+    // hard-code modify body xlacomputation input arguments
+    if (GetNameString() == "bodyctx") {
+      xla::XlaBuilder* local_builder = lowering_ctx.builder();
+      int64_t parameters_number_i = 7;
+      xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
+      xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
+                                      "WeightTensor");
+    }
 
     // Get the backing XLA tensors from the output torch tensor handles
     std::vector<XLATensorPtr> xtensors =
@@ -1748,11 +1718,12 @@ void InitXlaModuleBindings(py::module m) {
   m.def(
       "_xla_step_marker",
       [](const std::string& device, const std::vector<std::string>& devices,
-         bool wait) {
+         bool wait, bool reset_scope) {
         NoGilSection nogil;
-        StepMarker(device, devices, wait);
+        StepMarker(device, devices, wait, reset_scope);
       },
-      py::arg("device") = "", py::arg("devices"), py::arg("wait") = true);
+      py::arg("device") = "", py::arg("devices"), py::arg("wait") = true,
+      py::arg("reset_scope") = true);
   m.def("_get_stablehlo",
         [](const std::vector<at::Tensor>& tensors, const std::string& device,
            const std::vector<std::string>& devices,
@@ -2375,12 +2346,21 @@ void InitXlaModuleBindings(py::module m) {
         [](at::Tensor& self, const at::Tensor& source) -> at::Tensor& {
           return XLANativeFunctions::set_(self, source);
         });
-  m.def("_xla_tpu_custom_call_",
-        [](const std::vector<at::Tensor>& outputs,
-           const std::vector<at::Tensor>& inputs, const std::string& payload) {
-          auto x_outputs = bridge::GetXlaTensors(outputs);
-          return tensor_methods::tpu_custom_call_(
-              x_outputs, bridge::GetXlaTensors(inputs), payload);
+  m.def("_xla_tpu_custom_call",
+        [](const std::vector<at::Tensor>& inputs, const std::string& payload,
+           const std::vector<std::vector<int64_t>>& output_shapes,
+           const std::vector<py::object>& output_dtypes)
+            -> std::vector<at::Tensor> {
+          std::vector<at::ScalarType> dtypes;
+          dtypes.reserve(output_dtypes.size());
+          for (auto& dtype : output_dtypes) {
+            dtypes.push_back(
+                reinterpret_cast<THPDtype*>(dtype.ptr())->scalar_type);
+          }
+
+          auto xtensors = tensor_methods::tpu_custom_call(
+              bridge::GetXlaTensors(inputs), payload, output_shapes, dtypes);
+          return bridge::AtenFromXlaTensors(xtensors);
         });
   m.def("_set_xla_custom_op_name_prefix",
         [](const at::Tensor& input, const std::string& op_name_prefix,
@@ -2651,4 +2631,4 @@ void InitXlaBindings(py::module m) { InitXlaModuleBindings(m); }
 
 }  // namespace torch_xla
 
-PYBIND11_MODULE(_XLAC, m) { torch_xla::InitXlaBindings(m); }
+PYBIND11_MODULE(_XLAC, m) { torch_xla::InitXlaBindings(m); }
\ No newline at end of file

From 09eaa3786a743bc068f8d582dd5ae13279d81ffa Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 00:41:51 +0000
Subject: [PATCH 060/323] update

---
 torch_xla/csrc/init_python_bindings.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 9fa9e89b191..1fc9ac3ccdb 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -926,28 +926,28 @@ class PyLoweringContext {
         parameter_idx += 1;
       }
       xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {10});
-      xla::XlaOp x1 = xla::Parameter(local_builder, parameters_number_i, shape1,
+      xla::XlaOp x1 = xla::Parameter(local_builder, parameter_idx, shape1,
                                       "LInITensor");
-      parameters_number_i = parameters_number_i + 1;
+      parameter_idx = parameter_idx + 1;
       xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
-      xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
+      xla::XlaOp x2 = xla::Parameter(local_builder, parameter_idx, shape2,
                                       "WeightTensor");
-      parameters_number_i = parameters_number_i + 1;
+      parameter_idx = parameter_idx + 1;
       xla::Shape shape4 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-      xla::XlaOp x4 = xla::Parameter(local_builder, parameters_number_i, shape4,
+      xla::XlaOp x4 = xla::Parameter(local_builder, parameter_idx, shape4,
                                       "BiasTensor");
-      parameters_number_i = parameters_number_i + 1;
+      parameter_idx = parameter_idx + 1;
       xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-      xla::XlaOp x3 = xla::Parameter(local_builder, parameters_number_i, shape3,
+      xla::XlaOp x3 = xla::Parameter(local_builder, parameter_idx, shape3,
                                       "LOutTensor");
     }
 
     // hard-code modify body xlacomputation input arguments
     if (GetNameString() == "bodyctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
-      int64_t parameters_number_i = 7;
+      int64_t parameter_idx = 7;
       xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-      xla::XlaOp x2 = xla::Parameter(local_builder, parameters_number_i, shape2,
+      xla::XlaOp x2 = xla::Parameter(local_builder, parameter_idx, shape2,
                                       "WeightTensor");
     }
 

From 88bdb4a6241759dbdcd15341e43b496a690c3206 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 00:51:38 +0000
Subject: [PATCH 061/323] update

---
 ..._while_loop_simple_add_dispatch_in_torch.py | 18 ++++++++++++++++++
 torch_xla/experimental/fori_loop.py            |  3 ---
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 830b3ff7df7..e84adbaa194 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -69,6 +69,24 @@ def body_fn(init, limit_value):
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
 
+  def test_while_loop_tpu_subtraction_nested(self):
+
+    device = xm.xla_device()
+
+    def cond_fn(init, limit_value):
+      return limit_value[0] <= init[0]
+
+    def body_fn(init, limit_value):
+      one_value = torch.ones(1, dtype=torch.int32, device=device)
+      two_value = limit_value.clone()
+      return (torch.sub(torch.sub(init, one_value), one_value), two_value)
+
+    init = torch.tensor([10], dtype=torch.int32, device=device)
+    limit_value = torch.tensor([0], dtype=torch.int32, device=device)
+    res = while_loop(cond_fn, body_fn, (init, limit_value))
+    expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
+    self.assertEqual(expected, res)
+
   def test_fori_loop_tpu_addition(self):
 
     xm.mark_step()
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 57f7162bf3f..8141d777a4e 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -16,9 +16,6 @@
 def fori_loop(upper, lower, body_fun, init_val, *input_value):
 
   device = xm.xla_device()
-  # weight_0 = body_fun.weight
-  # bias_0 = body_fun.bias
-  # one_value = torch.tensor([1], dtype=torch.int32, device=device)
 
   def cond_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0): # , bias_0): # , output_value):
     return lower[0] < upper[0]

From ec5e999fe4689a7cad750c1473e15a92c898333a Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 06:00:18 +0000
Subject: [PATCH 062/323] update

---
 torch_xla/csrc/init_python_bindings.cpp | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 1fc9ac3ccdb..9a7cd811cb3 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -913,11 +913,29 @@ class PyLoweringContext {
   // Builds a HLO graph given a set of output tensors, and add unused parameters
   // needed in xlacomputation.
   void BuildForiLoop(std::vector<at::Tensor> tensors,
-                     std::vector<at::Tensor> input_arguments = {}) {
+                     std::vector<at::Tensor> additional_inputs_list = {}) {
     if (GetNameString() == "condctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
       // hard-code parameter_idx to 2 to skip existing upper/lower arguments
-      int64_t parameter_idx = 2;
+      // !!! since cond_fn only compare upper and lower, so it would only use two arguments, due to PyTorch/XLA
+      // !!! trace xlacomputation from result tensor, so all the other arguments would not be included or generated;
+      // !!! but to meet xla::while requirement, we would skip first two arguments,
+      // !!! then add all other arguments like body_fn/init
+      // !!! --- additional_inputs_list: this list include all other arguments like body_fn/init except upper and lower
+      // !!! --- next step: we add dump paras according to additional_inputs_list
+      // ??? --- could we get IRvalue of `additional_inputs_list` in this function to complete xlacomputation?
+      int64_t parameter_idx = 2; // parameter_idx start from 2 after upper and lower
+      // ? type, ? shape,
+      // for (int i = 0; i < additional_inputs_list.size(); i++) {
+      for (auto& additional_input_tensor : additional_inputs_list) {
+        XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
+        xla::Shape shape = xtensor->shape().get().ToString();
+        xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
+                                      "UnusedArgumentsPlaceholder");
+        parameter_idx += 1;
+        // xtensor->shape().get().ToString()
+        // xla_tensor->shaped_buffer().on_device_shape();
+      }
       for (int i = 0; i < 2; i++) {
         xla::Shape shape =
             xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});

From e870e70ad8d4ce055e3c8d2ec77333e736d3c09e Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 06:04:23 +0000
Subject: [PATCH 063/323] update

---
 torch_xla/csrc/init_python_bindings.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 9a7cd811cb3..53b55e04601 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -929,7 +929,7 @@ class PyLoweringContext {
       // for (int i = 0; i < additional_inputs_list.size(); i++) {
       for (auto& additional_input_tensor : additional_inputs_list) {
         XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
-        xla::Shape shape = xtensor->shape().get().ToString();
+        xla::Shape shape = xtensor->shape().get(); // .ToString();
         xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
                                       "UnusedArgumentsPlaceholder");
         parameter_idx += 1;

From 349ed260f87d14b6199fd732fde90797701ce0b3 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 06:07:34 +0000
Subject: [PATCH 064/323] update

---
 torch_xla/csrc/init_python_bindings.cpp | 44 ++++++++++++-------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 53b55e04601..e1df6657574 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -936,28 +936,28 @@ class PyLoweringContext {
         // xtensor->shape().get().ToString()
         // xla_tensor->shaped_buffer().on_device_shape();
       }
-      for (int i = 0; i < 2; i++) {
-        xla::Shape shape =
-            xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});
-        xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
-                                      "UnusedArgumentsPlaceholder");
-        parameter_idx += 1;
-      }
-      xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {10});
-      xla::XlaOp x1 = xla::Parameter(local_builder, parameter_idx, shape1,
-                                      "LInITensor");
-      parameter_idx = parameter_idx + 1;
-      xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
-      xla::XlaOp x2 = xla::Parameter(local_builder, parameter_idx, shape2,
-                                      "WeightTensor");
-      parameter_idx = parameter_idx + 1;
-      xla::Shape shape4 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-      xla::XlaOp x4 = xla::Parameter(local_builder, parameter_idx, shape4,
-                                      "BiasTensor");
-      parameter_idx = parameter_idx + 1;
-      xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-      xla::XlaOp x3 = xla::Parameter(local_builder, parameter_idx, shape3,
-                                      "LOutTensor");
+      // for (int i = 0; i < 2; i++) {
+      //   xla::Shape shape =
+      //       xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});
+      //   xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
+      //                                 "UnusedArgumentsPlaceholder");
+      //   parameter_idx += 1;
+      // }
+      // xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {10});
+      // xla::XlaOp x1 = xla::Parameter(local_builder, parameter_idx, shape1,
+      //                                 "LInITensor");
+      // parameter_idx = parameter_idx + 1;
+      // xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
+      // xla::XlaOp x2 = xla::Parameter(local_builder, parameter_idx, shape2,
+      //                                 "WeightTensor");
+      // parameter_idx = parameter_idx + 1;
+      // xla::Shape shape4 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
+      // xla::XlaOp x4 = xla::Parameter(local_builder, parameter_idx, shape4,
+      //                                 "BiasTensor");
+      // parameter_idx = parameter_idx + 1;
+      // xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
+      // xla::XlaOp x3 = xla::Parameter(local_builder, parameter_idx, shape3,
+      //                                 "LOutTensor");
     }
 
     // hard-code modify body xlacomputation input arguments

From 876298a657554dbb76c2f38ade7ad6426db6b932 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 08:22:56 +0000
Subject: [PATCH 065/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 torch_xla/experimental/fori_loop.py                             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index e84adbaa194..c2ed1669e64 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -126,4 +126,4 @@ def test_fori_loop_tpu_simple_linear(self):
 
 if __name__ == '__main__':
   test = unittest.main()
-  sys.exit(0 if test.result.wasSuccessful() else 1)
\ No newline at end of file
+  sys.exit(0 if test.result.wasSuccessful() else 1)
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 8141d777a4e..753c68ba455 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -69,7 +69,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
   cond_ctx.set_name_string("condctx")
   additional_inputs_list = list(fake_carried_inputs[2:])
   for i in range(len(additional_inputs)):
-    additional_inputs_list.append(additional_inputs[0])
+    additional_inputs_list.append(additional_inputs[i])
   cond_ctx.buildforiloop([cond_result], additional_inputs_list)
   cond_hlo = cond_ctx.hlo()
   cond_computation = xb.computation_from_module_proto("condcomputation",

From b73dc9629bfc4fcee805eff96ec3c014a1b7501e Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 08:30:33 +0000
Subject: [PATCH 066/323] update

---
 torch_xla/csrc/init_python_bindings.cpp | 40 ++++++-------------------
 torch_xla/experimental/fori_loop.py     |  9 ++++--
 2 files changed, 16 insertions(+), 33 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index e1df6657574..27dd7ac3bdf 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -925,48 +925,26 @@ class PyLoweringContext {
       // !!! --- next step: we add dump paras according to additional_inputs_list
       // ??? --- could we get IRvalue of `additional_inputs_list` in this function to complete xlacomputation?
       int64_t parameter_idx = 2; // parameter_idx start from 2 after upper and lower
-      // ? type, ? shape,
-      // for (int i = 0; i < additional_inputs_list.size(); i++) {
       for (auto& additional_input_tensor : additional_inputs_list) {
         XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
-        xla::Shape shape = xtensor->shape().get(); // .ToString();
+        xla::Shape shape = xtensor->shape().get();
         xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
                                       "UnusedArgumentsPlaceholder");
         parameter_idx += 1;
-        // xtensor->shape().get().ToString()
-        // xla_tensor->shaped_buffer().on_device_shape();
       }
-      // for (int i = 0; i < 2; i++) {
-      //   xla::Shape shape =
-      //       xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, {1});
-      //   xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
-      //                                 "UnusedArgumentsPlaceholder");
-      //   parameter_idx += 1;
-      // }
-      // xla::Shape shape1 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {10});
-      // xla::XlaOp x1 = xla::Parameter(local_builder, parameter_idx, shape1,
-      //                                 "LInITensor");
-      // parameter_idx = parameter_idx + 1;
-      // xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20, 10});
-      // xla::XlaOp x2 = xla::Parameter(local_builder, parameter_idx, shape2,
-      //                                 "WeightTensor");
-      // parameter_idx = parameter_idx + 1;
-      // xla::Shape shape4 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-      // xla::XlaOp x4 = xla::Parameter(local_builder, parameter_idx, shape4,
-      //                                 "BiasTensor");
-      // parameter_idx = parameter_idx + 1;
-      // xla::Shape shape3 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-      // xla::XlaOp x3 = xla::Parameter(local_builder, parameter_idx, shape3,
-      //                                 "LOutTensor");
     }
 
     // hard-code modify body xlacomputation input arguments
     if (GetNameString() == "bodyctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
-      int64_t parameter_idx = 7;
-      xla::Shape shape2 = xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {20});
-      xla::XlaOp x2 = xla::Parameter(local_builder, parameter_idx, shape2,
-                                      "WeightTensor");
+      int64_t parameter_idx = tensors.size();
+      for (auto& additional_input_tensor : additional_inputs_list) {
+        XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
+        xla::Shape shape = xtensor->shape().get();
+        xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
+                                      "UnusedArgumentsPlaceholder");
+        parameter_idx += 1;
+      }
     }
 
     // Get the backing XLA tensors from the output torch tensor handles
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 753c68ba455..88fa371a799 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -67,7 +67,8 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
   cond_result = cond_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], output_value=fake_carried_inputs[-2], bias_0=fake_carried_inputs[-1])
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
-  additional_inputs_list = list(fake_carried_inputs[2:])
+  additional_inputs_list = list(fake_carried_inputs[2:]) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
+  # treat and pass additional_inputs to cond_fn
   for i in range(len(additional_inputs)):
     additional_inputs_list.append(additional_inputs[i])
   cond_ctx.buildforiloop([cond_result], additional_inputs_list)
@@ -79,7 +80,11 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
   body_result = body_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], output_value=fake_carried_inputs[-2], bias_0=fake_carried_inputs[-1])
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
   body_ctx.set_name_string("bodyctx")
-  body_ctx.buildforiloop(list(body_result), [])
+  additional_inputs_list = list(fake_carried_inputs[-2]) # missed arguments due to given output_value was not used and PyTorch/XLA trace xlacomputation from output tensor
+  # TODO(@manfei): treat and pass additional_inputs to body_fn too
+  for i in range(len(additional_inputs)):
+    additional_inputs_list.append(additional_inputs[i])
+  body_ctx.buildforiloop(list(body_result), additional_inputs_list)
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",
                                                       body_hlo)

From b5accda1bbe8e741e0334a5fa8e650848aa87e9e Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 08:39:07 +0000
Subject: [PATCH 067/323] update

---
 torch_xla/csrc/init_python_bindings.cpp | 36 ++++++++++++++++---------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 27dd7ac3bdf..f90a5c11b1e 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -934,18 +934,18 @@ class PyLoweringContext {
       }
     }
 
-    // hard-code modify body xlacomputation input arguments
-    if (GetNameString() == "bodyctx") {
-      xla::XlaBuilder* local_builder = lowering_ctx.builder();
-      int64_t parameter_idx = tensors.size();
-      for (auto& additional_input_tensor : additional_inputs_list) {
-        XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
-        xla::Shape shape = xtensor->shape().get();
-        xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
-                                      "UnusedArgumentsPlaceholder");
-        parameter_idx += 1;
-      }
-    }
+    // // hard-code modify body xlacomputation input arguments
+    // if (GetNameString() == "bodyctx") {
+    //   xla::XlaBuilder* local_builder = lowering_ctx.builder();
+    //   int64_t parameter_idx = tensors.size();
+    //   for (auto& additional_input_tensor : additional_inputs_list) {
+    //     XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
+    //     xla::Shape shape = xtensor->shape().get();
+    //     xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
+    //                                   "UnusedArgumentsPlaceholder");
+    //     parameter_idx += 1;
+    //   }
+    // }
 
     // Get the backing XLA tensors from the output torch tensor handles
     std::vector<XLATensorPtr> xtensors =
@@ -972,6 +972,18 @@ class PyLoweringContext {
       std::vector<size_t> buffer_donor_indices;
       xla::ProgramShape program_shape =
           ConsumeValue(computation.GetProgramShape());
+      // hard-code modify body xlacomputation input arguments
+      if (GetNameString() == "bodyctx") {
+        xla::XlaBuilder* local_builder = lowering_ctx.builder();
+        int64_t parameter_idx = program_shape.parameters_size(); // tensors.size();
+        for (auto& additional_input_tensor : additional_inputs_list) {
+          XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
+          xla::Shape shape = xtensor->shape().get();
+          xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
+                                      "UnusedArgumentsPlaceholder");
+          parameter_idx += 1;
+        }
+      }
       // TODO(@manfei): please confirm whether we check for more than two or use
       // default value true
       bool should_wrap_parameter = (program_shape.parameters_size() >= 2);

From 8a1f6ad506a41fadd244786bfbbc57aaad37f82c Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 08:42:25 +0000
Subject: [PATCH 068/323] update

---
 torch_xla/csrc/init_python_bindings.cpp | 48 ++++++++++++-------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index f90a5c11b1e..7ce1ed45075 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -934,18 +934,18 @@ class PyLoweringContext {
       }
     }
 
-    // // hard-code modify body xlacomputation input arguments
-    // if (GetNameString() == "bodyctx") {
-    //   xla::XlaBuilder* local_builder = lowering_ctx.builder();
-    //   int64_t parameter_idx = tensors.size();
-    //   for (auto& additional_input_tensor : additional_inputs_list) {
-    //     XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
-    //     xla::Shape shape = xtensor->shape().get();
-    //     xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
-    //                                   "UnusedArgumentsPlaceholder");
-    //     parameter_idx += 1;
-    //   }
-    // }
+    // hard-code modify body xlacomputation input arguments
+    if (GetNameString() == "bodyctx") {
+      xla::XlaBuilder* local_builder = lowering_ctx.builder();
+      int64_t parameter_idx = 7; // tensors.size();
+      for (auto& additional_input_tensor : additional_inputs_list) {
+        XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
+        xla::Shape shape = xtensor->shape().get();
+        xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
+                                      "UnusedArgumentsPlaceholder");
+        parameter_idx += 1;
+      }
+    }
 
     // Get the backing XLA tensors from the output torch tensor handles
     std::vector<XLATensorPtr> xtensors =
@@ -972,18 +972,18 @@ class PyLoweringContext {
       std::vector<size_t> buffer_donor_indices;
       xla::ProgramShape program_shape =
           ConsumeValue(computation.GetProgramShape());
-      // hard-code modify body xlacomputation input arguments
-      if (GetNameString() == "bodyctx") {
-        xla::XlaBuilder* local_builder = lowering_ctx.builder();
-        int64_t parameter_idx = program_shape.parameters_size(); // tensors.size();
-        for (auto& additional_input_tensor : additional_inputs_list) {
-          XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
-          xla::Shape shape = xtensor->shape().get();
-          xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
-                                      "UnusedArgumentsPlaceholder");
-          parameter_idx += 1;
-        }
-      }
+      // // hard-code modify body xlacomputation input arguments
+      // if (GetNameString() == "bodyctx") {
+      //   xla::XlaBuilder* local_builder = lowering_ctx.builder();
+      //   int64_t parameter_idx = program_shape.parameters_size(); // tensors.size();
+      //   for (auto& additional_input_tensor : additional_inputs_list) {
+      //     XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
+      //     xla::Shape shape = xtensor->shape().get();
+      //     xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
+      //                                 "UnusedArgumentsPlaceholder");
+      //     parameter_idx += 1;
+      //   }
+      // }
       // TODO(@manfei): please confirm whether we check for more than two or use
       // default value true
       bool should_wrap_parameter = (program_shape.parameters_size() >= 2);

From 7914abf109fe9399f59d58290f6277c88de24e06 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 08:43:25 +0000
Subject: [PATCH 069/323] update

---
 torch_xla/csrc/init_python_bindings.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 7ce1ed45075..145739e1d1a 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -973,6 +973,8 @@ class PyLoweringContext {
       xla::ProgramShape program_shape =
           ConsumeValue(computation.GetProgramShape());
       // // hard-code modify body xlacomputation input arguments
+      // // xxx: failed due to not change body_xlacomputation, might becase has been traced
+      // // xxx: after `computation = ConsumeValue(lowering_ctx.BuildXla());`
       // if (GetNameString() == "bodyctx") {
       //   xla::XlaBuilder* local_builder = lowering_ctx.builder();
       //   int64_t parameter_idx = program_shape.parameters_size(); // tensors.size();

From 3da0e431daf640f22c69dd17bf26c217febbef14 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 08:46:08 +0000
Subject: [PATCH 070/323] update

---
 torch_xla/experimental/fori_loop.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 88fa371a799..ca1e1e67f08 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -84,6 +84,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
   # TODO(@manfei): treat and pass additional_inputs to body_fn too
   for i in range(len(additional_inputs)):
     additional_inputs_list.append(additional_inputs[i])
+  print("len!!!: ", len(additional_inputs_list))
   body_ctx.buildforiloop(list(body_result), additional_inputs_list)
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",

From 5c5ca2c083d847ef0e7b9f1c9a551294dae7e1a1 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 08:47:05 +0000
Subject: [PATCH 071/323] update

---
 torch_xla/experimental/fori_loop.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index ca1e1e67f08..5eeb88121f0 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -82,9 +82,11 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
   body_ctx.set_name_string("bodyctx")
   additional_inputs_list = list(fake_carried_inputs[-2]) # missed arguments due to given output_value was not used and PyTorch/XLA trace xlacomputation from output tensor
   # TODO(@manfei): treat and pass additional_inputs to body_fn too
+  print("len0!!!: ", len(additional_inputs_list))
   for i in range(len(additional_inputs)):
     additional_inputs_list.append(additional_inputs[i])
   print("len!!!: ", len(additional_inputs_list))
+  print("additional_inputs_list: ", additional_inputs_list)
   body_ctx.buildforiloop(list(body_result), additional_inputs_list)
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",

From 1c9e92fb47e9ef615ac8f907af1e6035e51a89b4 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 08:51:05 +0000
Subject: [PATCH 072/323] update

---
 torch_xla/experimental/fori_loop.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 5eeb88121f0..3237ae58a44 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -67,11 +67,11 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
   cond_result = cond_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], output_value=fake_carried_inputs[-2], bias_0=fake_carried_inputs[-1])
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
-  additional_inputs_list = list(fake_carried_inputs[2:]) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
+  additional_inputs_list_cond = list(fake_carried_inputs[2:]) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
   # treat and pass additional_inputs to cond_fn
   for i in range(len(additional_inputs)):
-    additional_inputs_list.append(additional_inputs[i])
-  cond_ctx.buildforiloop([cond_result], additional_inputs_list)
+    additional_inputs_list_cond.append(additional_inputs[i])
+  cond_ctx.buildforiloop([cond_result], additional_inputs_list_cond)
   cond_hlo = cond_ctx.hlo()
   cond_computation = xb.computation_from_module_proto("condcomputation",
                                                       cond_hlo)
@@ -80,14 +80,14 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
   body_result = body_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], output_value=fake_carried_inputs[-2], bias_0=fake_carried_inputs[-1])
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
   body_ctx.set_name_string("bodyctx")
-  additional_inputs_list = list(fake_carried_inputs[-2]) # missed arguments due to given output_value was not used and PyTorch/XLA trace xlacomputation from output tensor
+  additional_inputs_list_body = list(fake_carried_inputs[-2]) # missed arguments due to given output_value was not used and PyTorch/XLA trace xlacomputation from output tensor
   # TODO(@manfei): treat and pass additional_inputs to body_fn too
-  print("len0!!!: ", len(additional_inputs_list))
+  print("len0!!!: ", len(additional_inputs_list_body))
   for i in range(len(additional_inputs)):
-    additional_inputs_list.append(additional_inputs[i])
-  print("len!!!: ", len(additional_inputs_list))
-  print("additional_inputs_list: ", additional_inputs_list)
-  body_ctx.buildforiloop(list(body_result), additional_inputs_list)
+    additional_inputs_list_body.append(additional_inputs[i])
+  print("len!!!: ", len(additional_inputs_list_body))
+  print("additional_inputs_list_body: ", additional_inputs_list_body)
+  body_ctx.buildforiloop(list(body_result), additional_inputs_list_body)
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",
                                                       body_hlo)

From 52d1c177bd4b5e104002314dd75d99f42aa81c3c Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 08:51:56 +0000
Subject: [PATCH 073/323] update

---
 torch_xla/experimental/fori_loop.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 3237ae58a44..d8328176607 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -82,6 +82,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
   body_ctx.set_name_string("bodyctx")
   additional_inputs_list_body = list(fake_carried_inputs[-2]) # missed arguments due to given output_value was not used and PyTorch/XLA trace xlacomputation from output tensor
   # TODO(@manfei): treat and pass additional_inputs to body_fn too
+  print("list(fake_carried_inputs[-2]: ", list(fake_carried_inputs[-2])
   print("len0!!!: ", len(additional_inputs_list_body))
   for i in range(len(additional_inputs)):
     additional_inputs_list_body.append(additional_inputs[i])

From c4fe1222a8f5d11f90cfa169d04a07def03e94d4 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 08:52:30 +0000
Subject: [PATCH 074/323] update

---
 torch_xla/experimental/fori_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index d8328176607..13a4dbf51ee 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -82,7 +82,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
   body_ctx.set_name_string("bodyctx")
   additional_inputs_list_body = list(fake_carried_inputs[-2]) # missed arguments due to given output_value was not used and PyTorch/XLA trace xlacomputation from output tensor
   # TODO(@manfei): treat and pass additional_inputs to body_fn too
-  print("list(fake_carried_inputs[-2]: ", list(fake_carried_inputs[-2])
+  print("list(fake_carried_inputs[-2]: ", list(fake_carried_inputs[-2]))
   print("len0!!!: ", len(additional_inputs_list_body))
   for i in range(len(additional_inputs)):
     additional_inputs_list_body.append(additional_inputs[i])

From 80d20034dd8e19b53c7659971d45b5a9144a29df Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 08:54:44 +0000
Subject: [PATCH 075/323] update

---
 torch_xla/experimental/fori_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 13a4dbf51ee..bb18c1b65cf 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -80,9 +80,9 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
   body_result = body_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], output_value=fake_carried_inputs[-2], bias_0=fake_carried_inputs[-1])
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
   body_ctx.set_name_string("bodyctx")
-  additional_inputs_list_body = list(fake_carried_inputs[-2]) # missed arguments due to given output_value was not used and PyTorch/XLA trace xlacomputation from output tensor
+  additional_inputs_list_body = fake_carried_inputs[-2] # missed arguments due to given output_value was not used and PyTorch/XLA trace xlacomputation from output tensor
   # TODO(@manfei): treat and pass additional_inputs to body_fn too
-  print("list(fake_carried_inputs[-2]: ", list(fake_carried_inputs[-2]))
+  print("list(fake_carried_inputs[-2]: ", fake_carried_inputs[-2])
   print("len0!!!: ", len(additional_inputs_list_body))
   for i in range(len(additional_inputs)):
     additional_inputs_list_body.append(additional_inputs[i])

From 71718e15f0670e273bb5d8022d94aef6d7c77be5 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 08:56:26 +0000
Subject: [PATCH 076/323] update

---
 torch_xla/experimental/fori_loop.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index bb18c1b65cf..fa50a422008 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -80,14 +80,14 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
   body_result = body_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], output_value=fake_carried_inputs[-2], bias_0=fake_carried_inputs[-1])
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
   body_ctx.set_name_string("bodyctx")
-  additional_inputs_list_body = fake_carried_inputs[-2] # missed arguments due to given output_value was not used and PyTorch/XLA trace xlacomputation from output tensor
+  additional_inputs_list_body = [fake_carried_inputs[-2]] # missed arguments due to given output_value was not used and PyTorch/XLA trace xlacomputation from output tensor
   # TODO(@manfei): treat and pass additional_inputs to body_fn too
-  print("list(fake_carried_inputs[-2]: ", fake_carried_inputs[-2])
-  print("len0!!!: ", len(additional_inputs_list_body))
+  # print("list(fake_carried_inputs[-2]: ", fake_carried_inputs[-2])
+  # print("len0!!!: ", len(additional_inputs_list_body))
   for i in range(len(additional_inputs)):
     additional_inputs_list_body.append(additional_inputs[i])
-  print("len!!!: ", len(additional_inputs_list_body))
-  print("additional_inputs_list_body: ", additional_inputs_list_body)
+  # print("len!!!: ", len(additional_inputs_list_body))
+  # print("additional_inputs_list_body: ", additional_inputs_list_body)
   body_ctx.buildforiloop(list(body_result), additional_inputs_list_body)
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",

From e8e18f764d20a19d99eb102b72590e8c79426e75 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 20:17:49 +0000
Subject: [PATCH 077/323] update

---
 ...while_loop_simple_add_dispatch_in_torch.py | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index c2ed1669e64..3db6d4e93c1 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -87,6 +87,40 @@ def body_fn(init, limit_value):
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
 
+  def test_while_loop_tpu_simple_linear(self):
+
+    xm.mark_step()
+    device = xm.xla_device()
+    torch.set_grad_enabled(False)
+
+    upper = torch.tensor([52], dtype=torch.int32, device=device)
+    lower = torch.tensor([0], dtype=torch.int32, device=device)
+    init_val = torch.tensor([1], dtype=torch.int32, device=device)
+    l_in_0 = torch.randn(10, device=xm.xla_device())
+    output_value = torch.zeros([20], dtype=torch.float32, device=device)
+    one_value = torch.tensor([1], dtype=torch.int32, device=device)
+
+    linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
+    weight_0 = linear_0.weight
+    bias_0 = linear_0.bias
+
+    def cond_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0):
+      return lower[0] < upper[0]
+
+    def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0):
+      new_lower = torch.add(one_value, lower)
+      output_value = body_fun(*input_value)
+      weight = body_fun.weight
+      bias = body_fun.bias
+      return upper, new_lower, one_value, torch.add(one_value, x), *input_value, weight, bias, output_value
+
+    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_=  = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, *input_value, weight_0, bias_0, output_value))
+
+    expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
+
+    self.assertTrue(torch.all(torch.eq(expected, l_out_)))
+
+
   def test_fori_loop_tpu_addition(self):
 
     xm.mark_step()

From 9741e8d1f106a54ba82a0cf7f6d20b04bed1ea96 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 20:20:45 +0000
Subject: [PATCH 078/323] update

---
 ...i_loop_with_while_loop_simple_add_dispatch_in_torch.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 3db6d4e93c1..1be3ccdd3b4 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -95,10 +95,10 @@ def test_while_loop_tpu_simple_linear(self):
 
     upper = torch.tensor([52], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)
-    init_val = torch.tensor([1], dtype=torch.int32, device=device)
-    l_in_0 = torch.randn(10, device=xm.xla_device())
-    output_value = torch.zeros([20], dtype=torch.float32, device=device)
     one_value = torch.tensor([1], dtype=torch.int32, device=device)
+    init_val = torch.tensor([1], dtype=torch.int32, device=device) # x
+    l_in_0 = torch.randn(10, device=xm.xla_device()) # input_value
+    output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
     linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
     weight_0 = linear_0.weight
@@ -114,7 +114,7 @@ def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bi
       bias = body_fun.bias
       return upper, new_lower, one_value, torch.add(one_value, x), *input_value, weight, bias, output_value
 
-    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_=  = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, *input_value, weight_0, bias_0, output_value))
+    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 

From 26e30a399cbcbc3ba5ecc3db6f1828f81f2a8703 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 20:22:11 +0000
Subject: [PATCH 079/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 1be3ccdd3b4..0b5d79383af 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -114,7 +114,8 @@ def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bi
       bias = body_fun.bias
       return upper, new_lower, one_value, torch.add(one_value, x), *input_value, weight, bias, output_value
 
-    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value))
+    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = 
+      while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 

From c6f82592acc58dd87c9ec4546d903e32effbe11f Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 20:24:06 +0000
Subject: [PATCH 080/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 1 +
 torch_xla/experimental/fori_loop.py                             | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 0b5d79383af..83a345f3760 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -114,6 +114,7 @@ def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bi
       bias = body_fun.bias
       return upper, new_lower, one_value, torch.add(one_value, x), *input_value, weight, bias, output_value
 
+    print("!!! arrive here !!!")
     upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = 
       while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value))
 
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index fa50a422008..20f0f108035 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -44,12 +44,14 @@ def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   # TODO(@manfei): PyTorch require carried_inputs to be list/tuple, PyTorch/XLA _xla_while_loop only accept *operands, *operands would tuple items again: (a, '')
   # cond_fn&body_fn: callable
   # carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)
+  print("!!! arrive here too !!!")
   if additional_inputs is None:
     additional_inputs = tuple()
   return _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=additional_inputs) #  a=a, b=b, c=c,
 
 
 def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
+  print("!!! arrive here too too !!!")
   # untuple carried_inputs from while_loop
   carried_inputs = carried_inputs[0]
   # fake carried_inputs to split formal code

From 5b9378c437a4be7bdf5bba417cec81d69d099e6c Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 20:24:29 +0000
Subject: [PATCH 081/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 83a345f3760..f0674e6b994 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -115,8 +115,8 @@ def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bi
       return upper, new_lower, one_value, torch.add(one_value, x), *input_value, weight, bias, output_value
 
     print("!!! arrive here !!!")
-    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = 
-      while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value))
+    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(
+      cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 

From 64088798e4f5032b21a7fd2dbcdf300d3c2d1efa Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 20:26:05 +0000
Subject: [PATCH 082/323] update

---
 ...i_loop_with_while_loop_simple_add_dispatch_in_torch.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index f0674e6b994..bd41135199e 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -104,15 +104,15 @@ def test_while_loop_tpu_simple_linear(self):
     weight_0 = linear_0.weight
     bias_0 = linear_0.bias
 
-    def cond_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0):
+    def cond_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
       return lower[0] < upper[0]
 
-    def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0):
+    def body_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
       new_lower = torch.add(one_value, lower)
-      output_value = body_fun(*input_value)
+      output_value = body_fun(input_value)
       weight = body_fun.weight
       bias = body_fun.bias
-      return upper, new_lower, one_value, torch.add(one_value, x), *input_value, weight, bias, output_value
+      return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
 
     print("!!! arrive here !!!")
     upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(

From 7c408a93358a2c834e14fe5ac62c42184308dcab Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 20:27:48 +0000
Subject: [PATCH 083/323] update

---
 ...ori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index bd41135199e..3c103deddbd 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -109,9 +109,9 @@ def cond_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bia
 
     def body_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
       new_lower = torch.add(one_value, lower)
-      output_value = body_fun(input_value)
-      weight = body_fun.weight
-      bias = body_fun.bias
+      output_value = linear_0(input_value)
+      weight = linear_0.weight
+      bias = linear_0.bias
       return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
 
     print("!!! arrive here !!!")

From 9c6852877ba8511a6c3d2a1e592b66fbb5122b05 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 20:29:42 +0000
Subject: [PATCH 084/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 3c103deddbd..88ff7148a97 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -116,7 +116,7 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bia
 
     print("!!! arrive here !!!")
     upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(
-      cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value))
+      cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, weight_0, output_value, bias_0))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 

From 141c7043ddbbc53715f4ed54ad1f5b7347c0169e Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 20:41:27 +0000
Subject: [PATCH 085/323] update

---
 ...fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 88ff7148a97..3d5945cdb00 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -114,9 +114,8 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bia
       bias = linear_0.bias
       return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
 
-    print("!!! arrive here !!!")
-    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(
-      cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, weight_0, output_value, bias_0))
+    # print("!!! arrive here !!!")
+    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, weight_0, output_value, bias_0))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 

From d5e5537c7a12d9698c4ade31564ac085bed03725 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 20:43:09 +0000
Subject: [PATCH 086/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 3d5945cdb00..dd8d0701a7e 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -115,7 +115,7 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bia
       return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
 
     # print("!!! arrive here !!!")
-    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, weight_0, output_value, bias_0))
+    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, weight_0, output_value, bias_0), additional_inputs=None)
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 

From cfbe475113a9cde532fabf61da86052ea4aa40a0 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 20:47:31 +0000
Subject: [PATCH 087/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index dd8d0701a7e..3d5945cdb00 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -115,7 +115,7 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bia
       return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
 
     # print("!!! arrive here !!!")
-    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, weight_0, output_value, bias_0), additional_inputs=None)
+    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, weight_0, output_value, bias_0))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 

From b11f015586b13119dbc75d99662599c40ed6256c Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 20:58:28 +0000
Subject: [PATCH 088/323] update

---
 ...while_loop_simple_add_dispatch_in_torch.py | 38 ++++++++++++++++++-
 torch_xla/experimental/fori_loop.py           | 13 +++++--
 2 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 3d5945cdb00..e28d4cbd605 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -87,6 +87,39 @@ def body_fn(init, limit_value):
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
 
+# //////
+#     class SimpleWithLinear(torch.nn.Module):
+#         def __init__(self):
+#             super().__init__()
+#             self.linear = torch.nn.Linear(2, 2)
+#             self.register_buffer("dec", torch.tensor(1))
+
+#         def forward(self, iter, x):
+#             def cond_fn(it, x):
+#                 return it - self.dec > 0
+
+#             def body_fn(it, x):
+#                 return it - 1, self.linear(x)
+#             return while_loop(cond_fn, body_fn, (iter, x))
+
+#     class NestedWithLinear(torch.nn.Module):
+#             return while_loop(cond_fn, body_fn, (iter, x))
+
+#     nested2 = Nested()
+#     simple_with_linear = SimpleWithLinear()
+#     nested_with_linear = NestedWithLinear()
+
+#     x = torch.zeros(1)
+#     y = torch.zeros(1)
+#     z = torch.zeros(1)
+#     return {"simple": (simple, (x,)),
+#             "nested": (nested, (x, y, z)),
+#             "nested2": (nested2, (torch.tensor(2), torch.tensor(2), torch.ones(2, 2), torch.ones(2, 2))),
+#             "simple_with_mutation": (simple_with_mutation, (x,)),
+#             "simple_with_linear": (simple_with_linear, (torch.tensor(3), torch.randn(2, 2))),
+#             "nested_with_linear": (nested_with_linear, (torch.tensor(3), torch.randn(2, 2)))}
+# //////
+
   def test_while_loop_tpu_simple_linear(self):
 
     xm.mark_step()
@@ -104,10 +137,11 @@ def test_while_loop_tpu_simple_linear(self):
     weight_0 = linear_0.weight
     bias_0 = linear_0.bias
 
-    def cond_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
+    # def cond_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
+    def cond_fn(upper, lower, one_value, x, input_value, output_value):
       return lower[0] < upper[0]
 
-    def body_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
+    def body_fn(upper, lower, one_value, x, input_value, output_value):
       new_lower = torch.add(one_value, lower)
       output_value = linear_0(input_value)
       weight = linear_0.weight
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 20f0f108035..2bbecd42f64 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -50,7 +50,7 @@ def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   return _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=additional_inputs) #  a=a, b=b, c=c,
 
 
-def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
+def _xla_while_loop(cond_fn, body_fn, *carried_inputs, *additional_inputs):
   print("!!! arrive here too too !!!")
   # untuple carried_inputs from while_loop
   carried_inputs = carried_inputs[0]
@@ -62,11 +62,18 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
     fake_carried_inputs.append(
         torch.randint(10, carried_input.size(),
                       dtype=carried_input.dtype).to(device))
+  # fake_carried_inputs = tuple(fake_carried_inputs)
+  for additional_input in additional_inputs:
+    device = additional_input.device
+    #TODO(@manfei) type = carried_input.type
+    fake_carried_inputs.append(
+        torch.randint(10, additional_input.size(),
+                      dtype=additional_input.dtype).to(device))
   fake_carried_inputs = tuple(fake_carried_inputs)
 
   # generate cond_fn xlacomputation
   # TODO(@manfei): specify which element is for which argument like a,b,c
-  cond_result = cond_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], output_value=fake_carried_inputs[-2], bias_0=fake_carried_inputs[-1])
+  cond_result = cond_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-2], output_value=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-1])
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
   additional_inputs_list_cond = list(fake_carried_inputs[2:]) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
@@ -79,7 +86,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
                                                       cond_hlo)
 
   # generate body_fn xlacomputation
-  body_result = body_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-3], output_value=fake_carried_inputs[-2], bias_0=fake_carried_inputs[-1])
+  body_result = body_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-2], output_value=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-1])
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
   body_ctx.set_name_string("bodyctx")
   additional_inputs_list_body = [fake_carried_inputs[-2]] # missed arguments due to given output_value was not used and PyTorch/XLA trace xlacomputation from output tensor

From 9cd8ca054c191ca0d7334e25d80ed8e868291ed1 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:00:29 +0000
Subject: [PATCH 089/323] update

---
 torch_xla/experimental/fori_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 2bbecd42f64..d9b399d2dfb 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -50,7 +50,7 @@ def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   return _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=additional_inputs) #  a=a, b=b, c=c,
 
 
-def _xla_while_loop(cond_fn, body_fn, *carried_inputs, *additional_inputs):
+def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
   print("!!! arrive here too too !!!")
   # untuple carried_inputs from while_loop
   carried_inputs = carried_inputs[0]

From 89a2b34f9eb42539ddb4c046315c4bd4f9de8f3e Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:01:58 +0000
Subject: [PATCH 090/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index e28d4cbd605..307a2377bb7 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -149,7 +149,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
 
     # print("!!! arrive here !!!")
-    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, weight_0, output_value, bias_0))
+    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 

From 9e420ad35a059cf3ceea3eec2a63c79fc79a2eaf Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:05:19 +0000
Subject: [PATCH 091/323] update

---
 ...while_loop_simple_add_dispatch_in_torch.py | 71 ++++++++++++-------
 1 file changed, 47 insertions(+), 24 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 307a2377bb7..9264b43ba6a 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -126,30 +126,53 @@ def test_while_loop_tpu_simple_linear(self):
     device = xm.xla_device()
     torch.set_grad_enabled(False)
 
-    upper = torch.tensor([52], dtype=torch.int32, device=device)
-    lower = torch.tensor([0], dtype=torch.int32, device=device)
-    one_value = torch.tensor([1], dtype=torch.int32, device=device)
-    init_val = torch.tensor([1], dtype=torch.int32, device=device) # x
-    l_in_0 = torch.randn(10, device=xm.xla_device()) # input_value
-    output_value = torch.zeros([20], dtype=torch.float32, device=device)
-
-    linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
-    weight_0 = linear_0.weight
-    bias_0 = linear_0.bias
-
-    # def cond_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
-    def cond_fn(upper, lower, one_value, x, input_value, output_value):
-      return lower[0] < upper[0]
-
-    def body_fn(upper, lower, one_value, x, input_value, output_value):
-      new_lower = torch.add(one_value, lower)
-      output_value = linear_0(input_value)
-      weight = linear_0.weight
-      bias = linear_0.bias
-      return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
-
-    # print("!!! arrive here !!!")
-    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
+    class SimpleWithLinear(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.linear = torch.nn.Linear(10, 20).to(xm.xla_device())
+            # self.register_buffer("dec", torch.tensor(1))
+
+        def forward(self, upper, lower, one_value, x, input_value, output_value):
+            def cond_fn(upper, lower, one_value, x, input_value, output_value):
+                return lower[0] < upper[0]
+
+            def body_fn(upper, lower, one_value, x, input_value, output_value):
+                new_lower = torch.add(one_value, lower)
+                output_value = linear_0(input_value)
+                weight = linear_0.weight
+                bias = linear_0.bias
+                return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
+            # return while_loop(cond_fn, body_fn, (iter, x))
+            return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
+
+    # xm.mark_step()
+    # device = xm.xla_device()
+    # torch.set_grad_enabled(False)
+
+    # upper = torch.tensor([52], dtype=torch.int32, device=device)
+    # lower = torch.tensor([0], dtype=torch.int32, device=device)
+    # one_value = torch.tensor([1], dtype=torch.int32, device=device)
+    # init_val = torch.tensor([1], dtype=torch.int32, device=device) # x
+    # l_in_0 = torch.randn(10, device=xm.xla_device()) # input_value
+    # output_value = torch.zeros([20], dtype=torch.float32, device=device)
+
+    # linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
+    # weight_0 = linear_0.weight
+    # bias_0 = linear_0.bias
+
+    # # def cond_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
+    # def cond_fn(upper, lower, one_value, x, input_value, output_value):
+    #   return lower[0] < upper[0]
+
+    # def body_fn(upper, lower, one_value, x, input_value, output_value):
+    #   new_lower = torch.add(one_value, lower)
+    #   output_value = linear_0(input_value)
+    #   weight = linear_0.weight
+    #   bias = linear_0.bias
+    #   return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
+
+    # # print("!!! arrive here !!!")
+    # upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 

From a536a3eec0ff6fdedc02de789cf0baeac93df54c Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:05:41 +0000
Subject: [PATCH 092/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 9264b43ba6a..8c78ee4a8e1 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -174,9 +174,9 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     # # print("!!! arrive here !!!")
     # upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
-    expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
+    # expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 
-    self.assertTrue(torch.all(torch.eq(expected, l_out_)))
+    # self.assertTrue(torch.all(torch.eq(expected, l_out_)))
 
 
   def test_fori_loop_tpu_addition(self):

From b2b246f3804c435ae98962118a8909c9d70a7bd8 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:09:39 +0000
Subject: [PATCH 093/323] update

---
 ...while_loop_simple_add_dispatch_in_torch.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 8c78ee4a8e1..762f6960e58 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -145,6 +145,25 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
             # return while_loop(cond_fn, body_fn, (iter, x))
             return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
+    simple_with_linear = SimpleWithLinear()
+    upper = torch.tensor([52], dtype=torch.int32, device=device)
+    lower = torch.tensor([0], dtype=torch.int32, device=device)
+    one_value = torch.tensor([1], dtype=torch.int32, device=device)
+    init_val = torch.tensor([1], dtype=torch.int32, device=device) # x
+    l_in_0 = torch.randn(10, device=xm.xla_device()) # input_value
+    output_value = torch.zeros([20], dtype=torch.float32, device=device)
+
+    linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
+    weight_0 = linear_0.weight
+    bias_0 = linear_0.bias
+
+    return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
+
+#     x = torch.zeros(1)
+#     y = torch.zeros(1)
+#     z = torch.zeros(1)
+#     return {"simple_with_linear": (simple_with_linear, (torch.tensor(3), torch.randn(2, 2)))}
+
     # xm.mark_step()
     # device = xm.xla_device()
     # torch.set_grad_enabled(False)

From 2664e23a2f5053c34cea48eb9b217f4ca1b24a79 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:12:58 +0000
Subject: [PATCH 094/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 762f6960e58..9ef4132acd1 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -157,7 +157,9 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     weight_0 = linear_0.weight
     bias_0 = linear_0.bias
 
+    # return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
     return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
+    simple_with_linear.apply(upper, lower, one_value, init_val, l_in_0, output_value)
 
 #     x = torch.zeros(1)
 #     y = torch.zeros(1)

From 4f28a541e421aa4fa4ffade31b4c44e4f8a190ca Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:13:31 +0000
Subject: [PATCH 095/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 9ef4132acd1..f79c801f8e7 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -159,7 +159,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
 
     # return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
     return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
-    simple_with_linear.apply(upper, lower, one_value, init_val, l_in_0, output_value)
+    res = simple_with_linear.apply(upper, lower, one_value, init_val, l_in_0, output_value)
+    print("res: ", res)
 
 #     x = torch.zeros(1)
 #     y = torch.zeros(1)

From 5c993afa7292577a77baf9b73af80ee24428633c Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:14:35 +0000
Subject: [PATCH 096/323] update

---
 ...est_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index f79c801f8e7..37248be031a 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -161,6 +161,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
     res = simple_with_linear.apply(upper, lower, one_value, init_val, l_in_0, output_value)
     print("res: ", res)
+    import pdb; pdb.set_trace()
 
 #     x = torch.zeros(1)
 #     y = torch.zeros(1)

From 8298841bf8455c8cd58ed2a102e3879a86695326 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:15:44 +0000
Subject: [PATCH 097/323] update

---
 ...while_loop_simple_add_dispatch_in_torch.py | 113 +++++++-----------
 1 file changed, 41 insertions(+), 72 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 37248be031a..bec06748415 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -87,39 +87,6 @@ def body_fn(init, limit_value):
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
 
-# //////
-#     class SimpleWithLinear(torch.nn.Module):
-#         def __init__(self):
-#             super().__init__()
-#             self.linear = torch.nn.Linear(2, 2)
-#             self.register_buffer("dec", torch.tensor(1))
-
-#         def forward(self, iter, x):
-#             def cond_fn(it, x):
-#                 return it - self.dec > 0
-
-#             def body_fn(it, x):
-#                 return it - 1, self.linear(x)
-#             return while_loop(cond_fn, body_fn, (iter, x))
-
-#     class NestedWithLinear(torch.nn.Module):
-#             return while_loop(cond_fn, body_fn, (iter, x))
-
-#     nested2 = Nested()
-#     simple_with_linear = SimpleWithLinear()
-#     nested_with_linear = NestedWithLinear()
-
-#     x = torch.zeros(1)
-#     y = torch.zeros(1)
-#     z = torch.zeros(1)
-#     return {"simple": (simple, (x,)),
-#             "nested": (nested, (x, y, z)),
-#             "nested2": (nested2, (torch.tensor(2), torch.tensor(2), torch.ones(2, 2), torch.ones(2, 2))),
-#             "simple_with_mutation": (simple_with_mutation, (x,)),
-#             "simple_with_linear": (simple_with_linear, (torch.tensor(3), torch.randn(2, 2))),
-#             "nested_with_linear": (nested_with_linear, (torch.tensor(3), torch.randn(2, 2)))}
-# //////
-
   def test_while_loop_tpu_simple_linear(self):
 
     xm.mark_step()
@@ -163,45 +130,6 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     print("res: ", res)
     import pdb; pdb.set_trace()
 
-#     x = torch.zeros(1)
-#     y = torch.zeros(1)
-#     z = torch.zeros(1)
-#     return {"simple_with_linear": (simple_with_linear, (torch.tensor(3), torch.randn(2, 2)))}
-
-    # xm.mark_step()
-    # device = xm.xla_device()
-    # torch.set_grad_enabled(False)
-
-    # upper = torch.tensor([52], dtype=torch.int32, device=device)
-    # lower = torch.tensor([0], dtype=torch.int32, device=device)
-    # one_value = torch.tensor([1], dtype=torch.int32, device=device)
-    # init_val = torch.tensor([1], dtype=torch.int32, device=device) # x
-    # l_in_0 = torch.randn(10, device=xm.xla_device()) # input_value
-    # output_value = torch.zeros([20], dtype=torch.float32, device=device)
-
-    # linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
-    # weight_0 = linear_0.weight
-    # bias_0 = linear_0.bias
-
-    # # def cond_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
-    # def cond_fn(upper, lower, one_value, x, input_value, output_value):
-    #   return lower[0] < upper[0]
-
-    # def body_fn(upper, lower, one_value, x, input_value, output_value):
-    #   new_lower = torch.add(one_value, lower)
-    #   output_value = linear_0(input_value)
-    #   weight = linear_0.weight
-    #   bias = linear_0.bias
-    #   return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
-
-    # # print("!!! arrive here !!!")
-    # upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
-
-    # expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
-
-    # self.assertTrue(torch.all(torch.eq(expected, l_out_)))
-
-
   def test_fori_loop_tpu_addition(self):
 
     xm.mark_step()
@@ -242,3 +170,44 @@ def test_fori_loop_tpu_simple_linear(self):
 if __name__ == '__main__':
   test = unittest.main()
   sys.exit(0 if test.result.wasSuccessful() else 1)
+
+
+######## ---------------------------------------------------------
+
+#     x = torch.zeros(1)
+#     y = torch.zeros(1)
+#     z = torch.zeros(1)
+#     return {"simple_with_linear": (simple_with_linear, (torch.tensor(3), torch.randn(2, 2)))}
+
+    # xm.mark_step()
+    # device = xm.xla_device()
+    # torch.set_grad_enabled(False)
+
+    # upper = torch.tensor([52], dtype=torch.int32, device=device)
+    # lower = torch.tensor([0], dtype=torch.int32, device=device)
+    # one_value = torch.tensor([1], dtype=torch.int32, device=device)
+    # init_val = torch.tensor([1], dtype=torch.int32, device=device) # x
+    # l_in_0 = torch.randn(10, device=xm.xla_device()) # input_value
+    # output_value = torch.zeros([20], dtype=torch.float32, device=device)
+
+    # linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
+    # weight_0 = linear_0.weight
+    # bias_0 = linear_0.bias
+
+    # # def cond_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
+    # def cond_fn(upper, lower, one_value, x, input_value, output_value):
+    #   return lower[0] < upper[0]
+
+    # def body_fn(upper, lower, one_value, x, input_value, output_value):
+    #   new_lower = torch.add(one_value, lower)
+    #   output_value = linear_0(input_value)
+    #   weight = linear_0.weight
+    #   bias = linear_0.bias
+    #   return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
+
+    # # print("!!! arrive here !!!")
+    # upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
+
+    # expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
+
+    # self.assertTrue(torch.all(torch.eq(expected, l_out_)))

From 32af47b623529f7e0c6021cf92aeb818a8716b18 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:16:23 +0000
Subject: [PATCH 098/323] update

---
 ...est_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index bec06748415..beb31887e5c 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -92,6 +92,7 @@ def test_while_loop_tpu_simple_linear(self):
     xm.mark_step()
     device = xm.xla_device()
     torch.set_grad_enabled(False)
+    print("start test !!!")
 
     class SimpleWithLinear(torch.nn.Module):
         def __init__(self):

From 448de12d7a5ec519bf9474fbc5dc1dc57e30ee10 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:16:49 +0000
Subject: [PATCH 099/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index beb31887e5c..26438e6bf1e 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -92,7 +92,6 @@ def test_while_loop_tpu_simple_linear(self):
     xm.mark_step()
     device = xm.xla_device()
     torch.set_grad_enabled(False)
-    print("start test !!!")
 
     class SimpleWithLinear(torch.nn.Module):
         def __init__(self):
@@ -113,6 +112,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
             # return while_loop(cond_fn, body_fn, (iter, x))
             return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
+    print("start test !!!")
     simple_with_linear = SimpleWithLinear()
     upper = torch.tensor([52], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)

From abe4c788d7ce7ca396f0a77ce2e8b0116143e2d7 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:19:36 +0000
Subject: [PATCH 100/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 26438e6bf1e..5bd827626cb 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -92,6 +92,7 @@ def test_while_loop_tpu_simple_linear(self):
     xm.mark_step()
     device = xm.xla_device()
     torch.set_grad_enabled(False)
+    print("start test 1 !!!")
 
     class SimpleWithLinear(torch.nn.Module):
         def __init__(self):
@@ -112,7 +113,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
             # return while_loop(cond_fn, body_fn, (iter, x))
             return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
-    print("start test !!!")
+    print("start test 2 !!!")
     simple_with_linear = SimpleWithLinear()
     upper = torch.tensor([52], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)

From 1bf37d8322fa8aa19d5a635e3fa656dd574f5479 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:19:57 +0000
Subject: [PATCH 101/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 5bd827626cb..d2ad777b869 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -122,6 +122,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     l_in_0 = torch.randn(10, device=xm.xla_device()) # input_value
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
+    print("start test 3 !!!")
+
     linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
     weight_0 = linear_0.weight
     bias_0 = linear_0.bias

From 20215378510d3c65dd04d2679b088517cc0ebeba Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:20:27 +0000
Subject: [PATCH 102/323] update

---
 ...fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index d2ad777b869..12abf30392f 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -128,8 +128,11 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     weight_0 = linear_0.weight
     bias_0 = linear_0.bias
 
+    print("start test 4 !!!")
+
+    # return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
     # return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
-    return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
+    print("start test 5 !!!")
     res = simple_with_linear.apply(upper, lower, one_value, init_val, l_in_0, output_value)
     print("res: ", res)
     import pdb; pdb.set_trace()

From 83c2f97b2712f9c2d7009afd98246f56e6386456 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:20:57 +0000
Subject: [PATCH 103/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 12abf30392f..596a488cf60 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -135,7 +135,9 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     print("start test 5 !!!")
     res = simple_with_linear.apply(upper, lower, one_value, init_val, l_in_0, output_value)
     print("res: ", res)
+    print("start test 6 !!!")
     import pdb; pdb.set_trace()
+    return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
 
   def test_fori_loop_tpu_addition(self):
 

From 1945952aee79ac87e783cd2a805135741ecfc11e Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:22:55 +0000
Subject: [PATCH 104/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 596a488cf60..ab480592e76 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -129,11 +129,10 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     bias_0 = linear_0.bias
 
     print("start test 4 !!!")
-
     # return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
     # return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
     print("start test 5 !!!")
-    res = simple_with_linear.apply(upper, lower, one_value, init_val, l_in_0, output_value)
+    res = simple_with_linear.apply((upper, lower, one_value, init_val, l_in_0, output_value))
     print("res: ", res)
     print("start test 6 !!!")
     import pdb; pdb.set_trace()

From 6ee5400dd8f7c3c8256ff2045534406f0a2b4cc7 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:25:05 +0000
Subject: [PATCH 105/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index ab480592e76..cb148b60346 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -132,9 +132,11 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     # return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
     # return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
     print("start test 5 !!!")
+    aaa = {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
+    print("aaa: ", aaa)
+    print("start test 6 !!!")
     res = simple_with_linear.apply((upper, lower, one_value, init_val, l_in_0, output_value))
     print("res: ", res)
-    print("start test 6 !!!")
     import pdb; pdb.set_trace()
     return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
 

From 3db127e8bb9b2ce33bf280c87549b1d51870812c Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:25:54 +0000
Subject: [PATCH 106/323] update

---
 ..._loop_with_while_loop_simple_add_dispatch_in_torch.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index cb148b60346..be1310f914d 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -135,10 +135,11 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     aaa = {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
     print("aaa: ", aaa)
     print("start test 6 !!!")
-    res = simple_with_linear.apply((upper, lower, one_value, init_val, l_in_0, output_value))
-    print("res: ", res)
-    import pdb; pdb.set_trace()
-    return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
+    return aaa
+    # res = simple_with_linear.apply((upper, lower, one_value, init_val, l_in_0, output_value))
+    # print("res: ", res)
+    # import pdb; pdb.set_trace()
+    # return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
 
   def test_fori_loop_tpu_addition(self):
 

From dc1837d999b2a61cb1156d8b4171aa0c4c5f6362 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:30:03 +0000
Subject: [PATCH 107/323] update

---
 ...ith_while_loop_simple_add_dispatch_in_torch.py | 15 ++++++---------
 torch_xla/experimental/fori_loop.py               |  1 +
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index be1310f914d..3c33a118a75 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -92,7 +92,6 @@ def test_while_loop_tpu_simple_linear(self):
     xm.mark_step()
     device = xm.xla_device()
     torch.set_grad_enabled(False)
-    print("start test 1 !!!")
 
     class SimpleWithLinear(torch.nn.Module):
         def __init__(self):
@@ -113,7 +112,6 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
             # return while_loop(cond_fn, body_fn, (iter, x))
             return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
-    print("start test 2 !!!")
     simple_with_linear = SimpleWithLinear()
     upper = torch.tensor([52], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)
@@ -122,20 +120,19 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     l_in_0 = torch.randn(10, device=xm.xla_device()) # input_value
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
-    print("start test 3 !!!")
-
     linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
     weight_0 = linear_0.weight
     bias_0 = linear_0.bias
 
-    print("start test 4 !!!")
-    # return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
-    # return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
-    print("start test 5 !!!")
     aaa = {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
+    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = aaa
     print("aaa: ", aaa)
-    print("start test 6 !!!")
+    # print("start test 6 !!!")
     return aaa
+
+    expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
+
+    self.assertTrue(torch.all(torch.eq(expected, l_out_)))
     # res = simple_with_linear.apply((upper, lower, one_value, init_val, l_in_0, output_value))
     # print("res: ", res)
     # import pdb; pdb.set_trace()
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index d9b399d2dfb..70b4388ab9c 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -52,6 +52,7 @@ def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
 
 def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
   print("!!! arrive here too too !!!")
+  import pdb; pdb.set_trace()
   # untuple carried_inputs from while_loop
   carried_inputs = carried_inputs[0]
   # fake carried_inputs to split formal code

From ea9bf565ad47e9d7014e3c8660573de5d90f77a4 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:30:34 +0000
Subject: [PATCH 108/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 3c33a118a75..563cfd39adf 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -125,7 +125,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     bias_0 = linear_0.bias
 
     aaa = {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
-    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = aaa
+    # upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = aaa
     print("aaa: ", aaa)
     # print("start test 6 !!!")
     return aaa

From ee05a67db6108568ea831a89f5cb84fcc3dadc1d Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 21:31:40 +0000
Subject: [PATCH 109/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 563cfd39adf..138e1b18b46 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -117,7 +117,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     lower = torch.tensor([0], dtype=torch.int32, device=device)
     one_value = torch.tensor([1], dtype=torch.int32, device=device)
     init_val = torch.tensor([1], dtype=torch.int32, device=device) # x
-    l_in_0 = torch.randn(10, device=xm.xla_device()) # input_value
+    # l_in_0 = torch.randn(10, device=xm.xla_device()) # input_value
+    l_in_0 = torch.ones(10, device=xm.xla_device()) # input_value
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
     linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())

From 67f26521d10af7f4229cfb531689e821ff709327 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 22:43:46 +0000
Subject: [PATCH 110/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 138e1b18b46..44948605d9c 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -128,6 +128,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     aaa = {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
     # upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = aaa
     print("aaa: ", aaa)
+    bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
+    print("bbb: ", bbb)
     # print("start test 6 !!!")
     return aaa
 

From 3bba9fd7dfff560b07df023788f427f31a775286 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 22:45:40 +0000
Subject: [PATCH 111/323] update

---
 torch_xla/experimental/fori_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 70b4388ab9c..1e6d2dd732a 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -87,7 +87,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
                                                       cond_hlo)
 
   # generate body_fn xlacomputation
-  body_result = body_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-2], output_value=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-1])
+  body_result = body_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-1], output_value=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-2])
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
   body_ctx.set_name_string("bodyctx")
   additional_inputs_list_body = [fake_carried_inputs[-2]] # missed arguments due to given output_value was not used and PyTorch/XLA trace xlacomputation from output tensor

From 2e349a601de3dbc1bba418dc5a95e9242df98080 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 22:52:23 +0000
Subject: [PATCH 112/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 44948605d9c..b04d9dd3ef6 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -105,10 +105,10 @@ def cond_fn(upper, lower, one_value, x, input_value, output_value):
 
             def body_fn(upper, lower, one_value, x, input_value, output_value):
                 new_lower = torch.add(one_value, lower)
-                output_value = linear_0(input_value)
+                output_value_real = linear_0(input_value)
                 weight = linear_0.weight
                 bias = linear_0.bias
-                return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
+                return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value_real
             # return while_loop(cond_fn, body_fn, (iter, x))
             return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 

From 7b63eb845b858d45651eeff1334b3db32f4f8a0c Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 22:53:48 +0000
Subject: [PATCH 113/323] update

---
 ...est_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index b04d9dd3ef6..8b4804d5e31 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -105,6 +105,7 @@ def cond_fn(upper, lower, one_value, x, input_value, output_value):
 
             def body_fn(upper, lower, one_value, x, input_value, output_value):
                 new_lower = torch.add(one_value, lower)
+                output_value_real = output_value.copy()
                 output_value_real = linear_0(input_value)
                 weight = linear_0.weight
                 bias = linear_0.bias

From 62855cc33940a6ba8df45cce62e12f27592ccc00 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 22:54:25 +0000
Subject: [PATCH 114/323] update

---
 ...est_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 8b4804d5e31..b04d9dd3ef6 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -105,7 +105,6 @@ def cond_fn(upper, lower, one_value, x, input_value, output_value):
 
             def body_fn(upper, lower, one_value, x, input_value, output_value):
                 new_lower = torch.add(one_value, lower)
-                output_value_real = output_value.copy()
                 output_value_real = linear_0(input_value)
                 weight = linear_0.weight
                 bias = linear_0.bias

From 70a29c4f7a3064155b30516dd780f8f0a40b839b Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:02:23 +0000
Subject: [PATCH 115/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index b04d9dd3ef6..b1ad2095c3d 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -7,6 +7,7 @@
 # We need to import the underlying implementation function to register with the dispatcher
 import torch_xla.experimental.fori_loop
 from torch_xla.experimental.fori_loop import fori_loop
+from torch_xla.experimental.fori_loop import _xla_while_loop
 from torch._higher_order_ops.while_loop import while_loop
 import torch_xla.core.xla_model as xm
 import torch_xla.core.xla_builder as xb
@@ -110,7 +111,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
                 bias = linear_0.bias
                 return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value_real
             # return while_loop(cond_fn, body_fn, (iter, x))
-            return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
+            # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
+            return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()
     upper = torch.tensor([52], dtype=torch.int32, device=device)

From 863ba37a872db3f70beabefb97f9f1f42236bae8 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:03:04 +0000
Subject: [PATCH 116/323] update

---
 torch_xla/experimental/fori_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 1e6d2dd732a..979be6c254d 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -50,7 +50,7 @@ def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   return _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=additional_inputs) #  a=a, b=b, c=c,
 
 
-def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
+def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   print("!!! arrive here too too !!!")
   import pdb; pdb.set_trace()
   # untuple carried_inputs from while_loop

From 4da7ede1c84c9d2c2e6fcb64d2a71599146e6d90 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:04:17 +0000
Subject: [PATCH 117/323] update

---
 torch_xla/experimental/fori_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 979be6c254d..2de46cdd938 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -50,9 +50,9 @@ def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   return _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=additional_inputs) #  a=a, b=b, c=c,
 
 
-def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
+def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   print("!!! arrive here too too !!!")
-  import pdb; pdb.set_trace()
+  # import pdb; pdb.set_trace()
   # untuple carried_inputs from while_loop
   carried_inputs = carried_inputs[0]
   # fake carried_inputs to split formal code

From 261ec24bab6a0e6e954eecc42d3bbc423931132f Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:06:06 +0000
Subject: [PATCH 118/323] update

---
 torch_xla/experimental/fori_loop.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 2de46cdd938..a11af64db3a 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -64,6 +64,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
         torch.randint(10, carried_input.size(),
                       dtype=carried_input.dtype).to(device))
   # fake_carried_inputs = tuple(fake_carried_inputs)
+  print("fake_carried_inputs first: ", fake_carried_inputs)
   for additional_input in additional_inputs:
     device = additional_input.device
     #TODO(@manfei) type = carried_input.type
@@ -71,10 +72,11 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
         torch.randint(10, additional_input.size(),
                       dtype=additional_input.dtype).to(device))
   fake_carried_inputs = tuple(fake_carried_inputs)
+  print("fake_carried_inputs second: ", fake_carried_inputs)
 
   # generate cond_fn xlacomputation
   # TODO(@manfei): specify which element is for which argument like a,b,c
-  cond_result = cond_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-2], output_value=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-1])
+  cond_result = cond_fn(*fake_carried_inputs) # [:-3], weight_0=fake_carried_inputs[-2], output_value=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-1])
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
   additional_inputs_list_cond = list(fake_carried_inputs[2:]) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
@@ -87,7 +89,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
                                                       cond_hlo)
 
   # generate body_fn xlacomputation
-  body_result = body_fn(*fake_carried_inputs[:-3], weight_0=fake_carried_inputs[-1], output_value=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-2])
+  body_result = body_fn(*fake_carried_inputs) # [:-3], weight_0=fake_carried_inputs[-1], output_value=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-2])
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
   body_ctx.set_name_string("bodyctx")
   additional_inputs_list_body = [fake_carried_inputs[-2]] # missed arguments due to given output_value was not used and PyTorch/XLA trace xlacomputation from output tensor

From 334036ed6778841935683c208a29ad7559507add Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:09:01 +0000
Subject: [PATCH 119/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 torch_xla/experimental/fori_loop.py                             | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index b1ad2095c3d..4e6f9585dd3 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -129,7 +129,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
 
     aaa = {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
     # upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = aaa
-    print("aaa: ", aaa)
+    # print("aaa: ", aaa)
     bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
     print("bbb: ", bbb)
     # print("start test 6 !!!")
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index a11af64db3a..4396d25f184 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -52,6 +52,8 @@ def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
 
 def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   print("!!! arrive here too too !!!")
+  print("carried_inputs: ", carried_inputs)
+  print("additional_inputs: ", additional_inputs)
   # import pdb; pdb.set_trace()
   # untuple carried_inputs from while_loop
   carried_inputs = carried_inputs[0]

From acc2e205f131fc6b6e7392cbb0c0f470fe2780c0 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:10:54 +0000
Subject: [PATCH 120/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 4e6f9585dd3..c3a8a05bd3b 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -112,7 +112,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
                 return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value_real
             # return while_loop(cond_fn, body_fn, (iter, x))
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
-            return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
+            return 1
+            # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()
     upper = torch.tensor([52], dtype=torch.int32, device=device)

From 3e7172c80b1b992a31e5f02378433b7cd9f1df34 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:11:44 +0000
Subject: [PATCH 121/323] update

---
 ...fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index c3a8a05bd3b..cc72cfa995c 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -7,7 +7,7 @@
 # We need to import the underlying implementation function to register with the dispatcher
 import torch_xla.experimental.fori_loop
 from torch_xla.experimental.fori_loop import fori_loop
-from torch_xla.experimental.fori_loop import _xla_while_loop
+# from torch_xla.experimental.fori_loop import _xla_while_loop
 from torch._higher_order_ops.while_loop import while_loop
 import torch_xla.core.xla_model as xm
 import torch_xla.core.xla_builder as xb
@@ -112,7 +112,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
                 return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value_real
             # return while_loop(cond_fn, body_fn, (iter, x))
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
-            return 1
+            # return 1
+            return upper, lower, one_value, x, input_value, output_value
             # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()

From ead598174e2180a28a4bebab7e6180195bfbdd35 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:12:54 +0000
Subject: [PATCH 122/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index cc72cfa995c..c12bcb48120 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -113,7 +113,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
             # return while_loop(cond_fn, body_fn, (iter, x))
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
             # return 1
-            return upper, lower, one_value, x, input_value, output_value
+            # return upper, lower, one_value, x, input_value, output_value
+            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
             # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()

From 4762c554542215060c0729a3dfbb714e5f02fdda Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:13:55 +0000
Subject: [PATCH 123/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index c12bcb48120..0041e2f8d0a 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -113,8 +113,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
             # return while_loop(cond_fn, body_fn, (iter, x))
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
             # return 1
-            # return upper, lower, one_value, x, input_value, output_value
-            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
+            return upper, lower, one_value, x, input_value, output_value
+            # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
             # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()

From e4ff32fadbdf067d4734a0e92abad51ddd5b9038 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:14:41 +0000
Subject: [PATCH 124/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 0041e2f8d0a..5f56aaa7a9f 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -109,7 +109,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
                 output_value_real = linear_0(input_value)
                 weight = linear_0.weight
                 bias = linear_0.bias
-                return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value_real
+                # return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value_real
+                return upper, new_lower, one_value, torch.add(one_value, x), input_value, output_value_real
             # return while_loop(cond_fn, body_fn, (iter, x))
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
             # return 1

From 7215162b92e088a584c31800646329c5baf9182a Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:15:37 +0000
Subject: [PATCH 125/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 5f56aaa7a9f..46b3d8aa25f 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -114,8 +114,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
             # return while_loop(cond_fn, body_fn, (iter, x))
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
             # return 1
-            return upper, lower, one_value, x, input_value, output_value
-            # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
+            # return upper, lower, one_value, x, input_value, output_value
+            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
             # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()

From f275c0ee26805bb19ec2bb941d2b2f612668e200 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:16:07 +0000
Subject: [PATCH 126/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 46b3d8aa25f..5f56aaa7a9f 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -114,8 +114,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
             # return while_loop(cond_fn, body_fn, (iter, x))
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
             # return 1
-            # return upper, lower, one_value, x, input_value, output_value
-            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
+            return upper, lower, one_value, x, input_value, output_value
+            # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
             # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()

From 2958d830163c113afcd53da8a1f14278355f9faa Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:16:53 +0000
Subject: [PATCH 127/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 5f56aaa7a9f..46b3d8aa25f 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -114,8 +114,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
             # return while_loop(cond_fn, body_fn, (iter, x))
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
             # return 1
-            return upper, lower, one_value, x, input_value, output_value
-            # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
+            # return upper, lower, one_value, x, input_value, output_value
+            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
             # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()

From 9db53a3cb8ea4b4120f6ceddbadbdcf4435bc594 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:35:04 +0000
Subject: [PATCH 128/323] update

---
 ...fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 46b3d8aa25f..db2230ad85f 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -109,8 +109,11 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
                 output_value_real = linear_0(input_value)
                 weight = linear_0.weight
                 bias = linear_0.bias
+                new_upper = upper
+                new_one_value = one_value
+                new_input_value = input_value
                 # return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value_real
-                return upper, new_lower, one_value, torch.add(one_value, x), input_value, output_value_real
+                return new_upper, new_lower, new_one_value, torch.add(one_value, x), new_input_value, output_value_real
             # return while_loop(cond_fn, body_fn, (iter, x))
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
             # return 1

From d35e48ef4fbf85d03ec86e31e4b855a371ee229f Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:36:11 +0000
Subject: [PATCH 129/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index db2230ad85f..666ee191109 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -113,7 +113,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
                 new_one_value = one_value
                 new_input_value = input_value
                 # return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value_real
-                return new_upper, new_lower, new_one_value, torch.add(one_value, x), new_input_value, output_value_real
+                return upper.copy(), lower.copy(), one_value.copy(), torch.add(one_value, x), input_value.copy(), output_value_real
             # return while_loop(cond_fn, body_fn, (iter, x))
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
             # return 1

From 25ac04ed98217144733c3bc4ab3ac23b6f974f3f Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:37:47 +0000
Subject: [PATCH 130/323] update

---
 ...i_loop_with_while_loop_simple_add_dispatch_in_torch.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 666ee191109..19961d30e90 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -109,11 +109,11 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
                 output_value_real = linear_0(input_value)
                 weight = linear_0.weight
                 bias = linear_0.bias
-                new_upper = upper
-                new_one_value = one_value
-                new_input_value = input_value
+                # new_upper = upper
+                # new_one_value = one_value
+                # new_input_value = input_value
                 # return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value_real
-                return upper.copy(), lower.copy(), one_value.copy(), torch.add(one_value, x), input_value.copy(), output_value_real
+                return upper.clone(), lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real
             # return while_loop(cond_fn, body_fn, (iter, x))
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
             # return 1

From 3f473ad15ad38ebd05dcf8aff1656a03c5afbf30 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:42:21 +0000
Subject: [PATCH 131/323] update

---
 torch_xla/experimental/fori_loop.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 4396d25f184..cc15991f7a6 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -52,11 +52,13 @@ def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
 
 def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   print("!!! arrive here too too !!!")
-  print("carried_inputs: ", carried_inputs)
-  print("additional_inputs: ", additional_inputs)
+  # print("carried_inputs: ", carried_inputs)
+  # print("additional_inputs: ", additional_inputs)
   # import pdb; pdb.set_trace()
   # untuple carried_inputs from while_loop
   carried_inputs = carried_inputs[0]
+  # TODO(@manfei): please clear pass additional_inputs in `while_loop`'s defination in this file
+  additional_inputs = carried_inputs[0]
   # fake carried_inputs to split formal code
   fake_carried_inputs = []
   for carried_input in carried_inputs:

From b31e28062a05094110d824be21de677ccab89a39 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:44:07 +0000
Subject: [PATCH 132/323] update

---
 torch_xla/experimental/fori_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index cc15991f7a6..af014058009 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -52,8 +52,8 @@ def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
 
 def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   print("!!! arrive here too too !!!")
-  # print("carried_inputs: ", carried_inputs)
-  # print("additional_inputs: ", additional_inputs)
+  print("carried_inputs: ", carried_inputs)
+  print("additional_inputs: ", additional_inputs)
   # import pdb; pdb.set_trace()
   # untuple carried_inputs from while_loop
   carried_inputs = carried_inputs[0]

From 845d903893ecb04385e17652117ef0fd7795ca20 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:45:32 +0000
Subject: [PATCH 133/323] update

---
 torch_xla/experimental/fori_loop.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index af014058009..0da353ba660 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -50,15 +50,15 @@ def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   return _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=additional_inputs) #  a=a, b=b, c=c,
 
 
-def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
+def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_inputs=()):
   print("!!! arrive here too too !!!")
   print("carried_inputs: ", carried_inputs)
   print("additional_inputs: ", additional_inputs)
   # import pdb; pdb.set_trace()
   # untuple carried_inputs from while_loop
-  carried_inputs = carried_inputs[0]
+  carried_inputs = original_carried_inputs[0]
   # TODO(@manfei): please clear pass additional_inputs in `while_loop`'s defination in this file
-  additional_inputs = carried_inputs[0]
+  additional_inputs = original_carried_inputs[1]
   # fake carried_inputs to split formal code
   fake_carried_inputs = []
   for carried_input in carried_inputs:

From 13157d9b1158a7dcfb67d8e3df638ae1745cffdd Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:46:11 +0000
Subject: [PATCH 134/323] update

---
 torch_xla/experimental/fori_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 0da353ba660..def0a5a0e38 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -52,8 +52,8 @@ def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
 
 def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_inputs=()):
   print("!!! arrive here too too !!!")
-  print("carried_inputs: ", carried_inputs)
-  print("additional_inputs: ", additional_inputs)
+  # print("carried_inputs: ", carried_inputs)
+  # print("additional_inputs: ", additional_inputs)
   # import pdb; pdb.set_trace()
   # untuple carried_inputs from while_loop
   carried_inputs = original_carried_inputs[0]

From 41d3ba6d11491e9bd09377880d84014ccab0b536 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:48:12 +0000
Subject: [PATCH 135/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 torch_xla/experimental/fori_loop.py                           | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 19961d30e90..d5a190279b1 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -113,7 +113,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
                 # new_one_value = one_value
                 # new_input_value = input_value
                 # return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value_real
-                return upper.clone(), lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real
+                return upper.clone(), lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), weight, bias, output_value_real
             # return while_loop(cond_fn, body_fn, (iter, x))
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
             # return 1
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index def0a5a0e38..cff55d207bb 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -68,7 +68,7 @@ def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_input
         torch.randint(10, carried_input.size(),
                       dtype=carried_input.dtype).to(device))
   # fake_carried_inputs = tuple(fake_carried_inputs)
-  print("fake_carried_inputs first: ", fake_carried_inputs)
+  # print("fake_carried_inputs first: ", fake_carried_inputs)
   for additional_input in additional_inputs:
     device = additional_input.device
     #TODO(@manfei) type = carried_input.type
@@ -76,7 +76,7 @@ def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_input
         torch.randint(10, additional_input.size(),
                       dtype=additional_input.dtype).to(device))
   fake_carried_inputs = tuple(fake_carried_inputs)
-  print("fake_carried_inputs second: ", fake_carried_inputs)
+  # print("fake_carried_inputs second: ", fake_carried_inputs)
 
   # generate cond_fn xlacomputation
   # TODO(@manfei): specify which element is for which argument like a,b,c

From 52cd5bc9a6a7cc1bfae412b674c60561a9cf3887 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:49:08 +0000
Subject: [PATCH 136/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index d5a190279b1..ecfe7cd16f9 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -113,7 +113,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
                 # new_one_value = one_value
                 # new_input_value = input_value
                 # return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value_real
-                return upper.clone(), lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), weight, bias, output_value_real
+                return upper.clone(), lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), weight.clone(), bias.clone(), output_value_real
             # return while_loop(cond_fn, body_fn, (iter, x))
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
             # return 1

From a3756e6ef21ef12773bcfc331016b1fdf80226d8 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:50:48 +0000
Subject: [PATCH 137/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index ecfe7cd16f9..bf015598af4 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -137,7 +137,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     aaa = {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
     # upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = aaa
     # print("aaa: ", aaa)
-    bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
+    # bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
+    bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value)
     print("bbb: ", bbb)
     # print("start test 6 !!!")
     return aaa

From 21ba83dbff3d6987c91e624523cf895da80abca9 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:52:54 +0000
Subject: [PATCH 138/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index bf015598af4..3119a992ac3 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -100,7 +100,7 @@ def __init__(self):
             self.linear = torch.nn.Linear(10, 20).to(xm.xla_device())
             # self.register_buffer("dec", torch.tensor(1))
 
-        def forward(self, upper, lower, one_value, x, input_value, output_value):
+        def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             def cond_fn(upper, lower, one_value, x, input_value, output_value):
                 return lower[0] < upper[0]
 

From 24f2fe3a3c89faa5ebfa86efc878d348a323c027 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:54:19 +0000
Subject: [PATCH 139/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 3119a992ac3..439c79b3393 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -118,7 +118,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
             # return 1
             # return upper, lower, one_value, x, input_value, output_value
-            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
+            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_0, bias_0, output_value))
             # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()

From 4687bbeee679def69b6790fe3f6c49b2d827f9a2 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:55:47 +0000
Subject: [PATCH 140/323] update

---
 torch_xla/experimental/fori_loop.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index cff55d207bb..59c925c83ed 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -69,14 +69,14 @@ def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_input
                       dtype=carried_input.dtype).to(device))
   # fake_carried_inputs = tuple(fake_carried_inputs)
   # print("fake_carried_inputs first: ", fake_carried_inputs)
-  for additional_input in additional_inputs:
-    device = additional_input.device
-    #TODO(@manfei) type = carried_input.type
-    fake_carried_inputs.append(
-        torch.randint(10, additional_input.size(),
-                      dtype=additional_input.dtype).to(device))
-  fake_carried_inputs = tuple(fake_carried_inputs)
-  # print("fake_carried_inputs second: ", fake_carried_inputs)
+  # for additional_input in additional_inputs:
+  #   device = additional_input.device
+  #   #TODO(@manfei) type = carried_input.type
+  #   fake_carried_inputs.append(
+  #       torch.randint(10, additional_input.size(),
+  #                     dtype=additional_input.dtype).to(device))
+  # fake_carried_inputs = tuple(fake_carried_inputs)
+  # # print("fake_carried_inputs second: ", fake_carried_inputs)
 
   # generate cond_fn xlacomputation
   # TODO(@manfei): specify which element is for which argument like a,b,c

From 8713b1c4ba79e889e00fb56eee51445f73cae70b Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:56:36 +0000
Subject: [PATCH 141/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 439c79b3393..3ae1e5847ed 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -101,10 +101,10 @@ def __init__(self):
             # self.register_buffer("dec", torch.tensor(1))
 
         def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
-            def cond_fn(upper, lower, one_value, x, input_value, output_value):
+            def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
                 return lower[0] < upper[0]
 
-            def body_fn(upper, lower, one_value, x, input_value, output_value):
+            def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
                 new_lower = torch.add(one_value, lower)
                 output_value_real = linear_0(input_value)
                 weight = linear_0.weight

From 05f1c33034020b678e8fb3056773ba12ed5331bb Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Fri, 12 Apr 2024 23:57:49 +0000
Subject: [PATCH 142/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 3ae1e5847ed..439c79b3393 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -101,10 +101,10 @@ def __init__(self):
             # self.register_buffer("dec", torch.tensor(1))
 
         def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
-            def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            def cond_fn(upper, lower, one_value, x, input_value, output_value):
                 return lower[0] < upper[0]
 
-            def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            def body_fn(upper, lower, one_value, x, input_value, output_value):
                 new_lower = torch.add(one_value, lower)
                 output_value_real = linear_0(input_value)
                 weight = linear_0.weight

From d96d7355bcf681b5a2ecb137dbad700f7d08a6c0 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Sat, 13 Apr 2024 00:02:52 +0000
Subject: [PATCH 143/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 439c79b3393..3ae1e5847ed 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -101,10 +101,10 @@ def __init__(self):
             # self.register_buffer("dec", torch.tensor(1))
 
         def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
-            def cond_fn(upper, lower, one_value, x, input_value, output_value):
+            def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
                 return lower[0] < upper[0]
 
-            def body_fn(upper, lower, one_value, x, input_value, output_value):
+            def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
                 new_lower = torch.add(one_value, lower)
                 output_value_real = linear_0(input_value)
                 weight = linear_0.weight

From e12bda67c7f8447ec259e5b6494418c05e946348 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Sat, 13 Apr 2024 00:04:29 +0000
Subject: [PATCH 144/323] update

---
 ...fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 3ae1e5847ed..608460153a4 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -100,7 +100,10 @@ def __init__(self):
             self.linear = torch.nn.Linear(10, 20).to(xm.xla_device())
             # self.register_buffer("dec", torch.tensor(1))
 
-        def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+        def forward(self, upper, lower, one_value, x, input_value, output_value):
+            weight_0 = linear_0.weight
+            bias_0 = linear_0.bias
+
             def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
                 return lower[0] < upper[0]
 

From 8ed8f12ec102a3d61d0d6ab8914bcc9486fd2181 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Sat, 13 Apr 2024 00:05:18 +0000
Subject: [PATCH 145/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 608460153a4..88cec10543d 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -100,10 +100,9 @@ def __init__(self):
             self.linear = torch.nn.Linear(10, 20).to(xm.xla_device())
             # self.register_buffer("dec", torch.tensor(1))
 
-        def forward(self, upper, lower, one_value, x, input_value, output_value):
+        def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             weight_0 = linear_0.weight
             bias_0 = linear_0.bias
-
             def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
                 return lower[0] < upper[0]
 

From 094a4cafc2f597652106a2098e55abee9c0f5e39 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Sat, 13 Apr 2024 00:05:43 +0000
Subject: [PATCH 146/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 88cec10543d..51a2cae47b2 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -101,8 +101,8 @@ def __init__(self):
             # self.register_buffer("dec", torch.tensor(1))
 
         def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
-            weight_0 = linear_0.weight
-            bias_0 = linear_0.bias
+            weight_0 = self.linear.weight
+            bias_0 = self.linear.bias
             def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
                 return lower[0] < upper[0]
 

From 529a8c824f43a72c9f5e035cae512cff88b9c0ca Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Sat, 13 Apr 2024 00:10:15 +0000
Subject: [PATCH 147/323] update

---
 ...ori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 51a2cae47b2..0e17da9d940 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -101,8 +101,8 @@ def __init__(self):
             # self.register_buffer("dec", torch.tensor(1))
 
         def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
-            weight_0 = self.linear.weight
-            bias_0 = self.linear.bias
+            weight_1 = self.linear.weight
+            bias_1 = self.linear.bias
             def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
                 return lower[0] < upper[0]
 
@@ -120,7 +120,7 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_va
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
             # return 1
             # return upper, lower, one_value, x, input_value, output_value
-            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_0, bias_0, output_value))
+            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_0, bias_0, output_value), (weight_1, bias_1))
             # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()

From 57083aef6460ddd81f771be54abeb2fde0b1d25a Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Sat, 13 Apr 2024 00:10:59 +0000
Subject: [PATCH 148/323] update

---
 ...i_loop_with_while_loop_simple_add_dispatch_in_torch.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 0e17da9d940..a579d109ed6 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -108,9 +108,9 @@ def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_va
 
             def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
                 new_lower = torch.add(one_value, lower)
-                output_value_real = linear_0(input_value)
-                weight = linear_0.weight
-                bias = linear_0.bias
+                output_value_real = self.linear(input_value)
+                weight = self.linear.weight
+                bias = self.linear.bias
                 # new_upper = upper
                 # new_one_value = one_value
                 # new_input_value = input_value
@@ -120,7 +120,7 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_va
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
             # return 1
             # return upper, lower, one_value, x, input_value, output_value
-            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_0, bias_0, output_value), (weight_1, bias_1))
+            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_0, bias_0, output_value))
             # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()

From 630caa9b88a38e677cce523b332821817b999172 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Sat, 13 Apr 2024 00:12:54 +0000
Subject: [PATCH 149/323] update

---
 ...i_loop_with_while_loop_simple_add_dispatch_in_torch.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index a579d109ed6..182f5fd6985 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -132,15 +132,15 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_va
     l_in_0 = torch.ones(10, device=xm.xla_device()) # input_value
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
-    linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
-    weight_0 = linear_0.weight
-    bias_0 = linear_0.bias
+    # linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
+    weight_0 = simple_with_linear.linear.weight
+    bias_0 = simple_with_linear.linear.bias
 
     aaa = {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
     # upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = aaa
     # print("aaa: ", aaa)
     # bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
-    bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value)
+    bbb = simple_with_linear((upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value), (weight_0, bias_0))
     print("bbb: ", bbb)
     # print("start test 6 !!!")
     return aaa

From 406635b973fed0212d4dd1f851fca581a2613be3 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Sat, 13 Apr 2024 00:14:34 +0000
Subject: [PATCH 150/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 182f5fd6985..0fdcc385493 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -140,7 +140,7 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_va
     # upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = aaa
     # print("aaa: ", aaa)
     # bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
-    bbb = simple_with_linear((upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value), (weight_0, bias_0))
+    bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value, weight_0, bias_0)
     print("bbb: ", bbb)
     # print("start test 6 !!!")
     return aaa

From 992546501e3f896dcea1a07601b80241988d4858 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Sat, 13 Apr 2024 00:15:20 +0000
Subject: [PATCH 151/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 0fdcc385493..b19e0c80243 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -140,7 +140,7 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_va
     # upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = aaa
     # print("aaa: ", aaa)
     # bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
-    bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value, weight_0, bias_0)
+    bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value) # , weight_0, bias_0)
     print("bbb: ", bbb)
     # print("start test 6 !!!")
     return aaa

From 9b1b7a72ffeb26a1ebb0044e7efb724c55bc4209 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 18:28:47 +0000
Subject: [PATCH 152/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index b19e0c80243..2d64dcf1bab 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -100,7 +100,8 @@ def __init__(self):
             self.linear = torch.nn.Linear(10, 20).to(xm.xla_device())
             # self.register_buffer("dec", torch.tensor(1))
 
-        def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+        # def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+        def forward(self, upper, lower, one_value, x, input_value, output_value):
             weight_1 = self.linear.weight
             bias_1 = self.linear.bias
             def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):

From 3cf020071931568b7df7bc8ff249c977a64c10bd Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 18:31:32 +0000
Subject: [PATCH 153/323] update

---
 ...ri_loop_with_while_loop_simple_add_dispatch_in_torch.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 2d64dcf1bab..40c81de463c 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -100,8 +100,8 @@ def __init__(self):
             self.linear = torch.nn.Linear(10, 20).to(xm.xla_device())
             # self.register_buffer("dec", torch.tensor(1))
 
-        # def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
-        def forward(self, upper, lower, one_value, x, input_value, output_value):
+        def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+        # def forward(self, upper, lower, one_value, x, input_value, output_value):
             weight_1 = self.linear.weight
             bias_1 = self.linear.bias
             def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
@@ -141,7 +141,8 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_va
     # upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = aaa
     # print("aaa: ", aaa)
     # bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
-    bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value) # , weight_0, bias_0)
+    # bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value) # , weight_0, bias_0)
+    bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
     print("bbb: ", bbb)
     # print("start test 6 !!!")
     return aaa

From 1b06687e1fb551725067aa729394fdfd3831fff7 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 18:32:15 +0000
Subject: [PATCH 154/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 40c81de463c..f663f5fde04 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -142,7 +142,7 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_va
     # print("aaa: ", aaa)
     # bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
     # bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value) # , weight_0, bias_0)
-    bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
+    bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value)
     print("bbb: ", bbb)
     # print("start test 6 !!!")
     return aaa

From 13b77670d9e3df788ea01d561e4fcd9d864c973a Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 18:35:35 +0000
Subject: [PATCH 155/323] update

---
 ...th_while_loop_simple_add_dispatch_in_torch.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index f663f5fde04..63bf159f0c8 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -100,14 +100,16 @@ def __init__(self):
             self.linear = torch.nn.Linear(10, 20).to(xm.xla_device())
             # self.register_buffer("dec", torch.tensor(1))
 
-        def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
-        # def forward(self, upper, lower, one_value, x, input_value, output_value):
+        # def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+        def forward(self, upper, lower, one_value, x, input_value, output_value):
             weight_1 = self.linear.weight
             bias_1 = self.linear.bias
-            def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            # def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            def cond_fn(upper, lower, one_value, x, input_value, output_value):
                 return lower[0] < upper[0]
 
-            def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            # def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            def body_fn(upper, lower, one_value, x, input_value, output_value):
                 new_lower = torch.add(one_value, lower)
                 output_value_real = self.linear(input_value)
                 weight = self.linear.weight
@@ -121,7 +123,8 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_va
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
             # return 1
             # return upper, lower, one_value, x, input_value, output_value
-            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_0, bias_0, output_value))
+            # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_0, bias_0, output_value))
+            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_1, bias_1, output_value))
             # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()
@@ -142,7 +145,8 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_va
     # print("aaa: ", aaa)
     # bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
     # bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value) # , weight_0, bias_0)
-    bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value)
+    # bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value)
+    bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
     print("bbb: ", bbb)
     # print("start test 6 !!!")
     return aaa

From 90f6df1db6c43025994bce2bf01356370736b65d Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 18:36:36 +0000
Subject: [PATCH 156/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 63bf159f0c8..fa69205cf75 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -124,8 +124,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
             # return 1
             # return upper, lower, one_value, x, input_value, output_value
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_0, bias_0, output_value))
-            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_1, bias_1, output_value))
-            # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
+            # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_1, bias_1, output_value))
+            return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()
     upper = torch.tensor([52], dtype=torch.int32, device=device)

From 732876b7f78015781359dc148a6fef0c97734760 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 18:37:04 +0000
Subject: [PATCH 157/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index fa69205cf75..94a4e753f10 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -7,7 +7,7 @@
 # We need to import the underlying implementation function to register with the dispatcher
 import torch_xla.experimental.fori_loop
 from torch_xla.experimental.fori_loop import fori_loop
-# from torch_xla.experimental.fori_loop import _xla_while_loop
+from torch_xla.experimental.fori_loop import _xla_while_loop
 from torch._higher_order_ops.while_loop import while_loop
 import torch_xla.core.xla_model as xm
 import torch_xla.core.xla_builder as xb

From 1ec25fef85e3267f2530923e9ce9582a3d866947 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 18:38:11 +0000
Subject: [PATCH 158/323] update

---
 ...ori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 6 +++---
 torch_xla/experimental/fori_loop.py                         | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 94a4e753f10..63bf159f0c8 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -7,7 +7,7 @@
 # We need to import the underlying implementation function to register with the dispatcher
 import torch_xla.experimental.fori_loop
 from torch_xla.experimental.fori_loop import fori_loop
-from torch_xla.experimental.fori_loop import _xla_while_loop
+# from torch_xla.experimental.fori_loop import _xla_while_loop
 from torch._higher_order_ops.while_loop import while_loop
 import torch_xla.core.xla_model as xm
 import torch_xla.core.xla_builder as xb
@@ -124,8 +124,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
             # return 1
             # return upper, lower, one_value, x, input_value, output_value
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_0, bias_0, output_value))
-            # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_1, bias_1, output_value))
-            return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
+            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_1, bias_1, output_value))
+            # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()
     upper = torch.tensor([52], dtype=torch.int32, device=device)
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 59c925c83ed..7d53701a444 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -58,7 +58,8 @@ def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_input
   # untuple carried_inputs from while_loop
   carried_inputs = original_carried_inputs[0]
   # TODO(@manfei): please clear pass additional_inputs in `while_loop`'s defination in this file
-  additional_inputs = original_carried_inputs[1]
+  if len(original_carried_inputs) == 2:
+    additional_inputs = original_carried_inputs[1]
   # fake carried_inputs to split formal code
   fake_carried_inputs = []
   for carried_input in carried_inputs:

From fdb7cabb2c77829c2b424dd4ec56215aac4ba4df Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 18:39:02 +0000
Subject: [PATCH 159/323] update

---
 ...i_loop_with_while_loop_simple_add_dispatch_in_torch.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 63bf159f0c8..1ca0d4aa8fb 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -104,12 +104,12 @@ def __init__(self):
         def forward(self, upper, lower, one_value, x, input_value, output_value):
             weight_1 = self.linear.weight
             bias_1 = self.linear.bias
-            # def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
-            def cond_fn(upper, lower, one_value, x, input_value, output_value):
+            def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            # def cond_fn(upper, lower, one_value, x, input_value, output_value):
                 return lower[0] < upper[0]
 
-            # def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
-            def body_fn(upper, lower, one_value, x, input_value, output_value):
+            def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            # def body_fn(upper, lower, one_value, x, input_value, output_value):
                 new_lower = torch.add(one_value, lower)
                 output_value_real = self.linear(input_value)
                 weight = self.linear.weight

From 17d43eff82536499381458e7bcbea00440274fa9 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 18:41:07 +0000
Subject: [PATCH 160/323] update

---
 torch_xla/experimental/fori_loop.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 7d53701a444..a11697e33c5 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -79,6 +79,7 @@ def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_input
   # fake_carried_inputs = tuple(fake_carried_inputs)
   # # print("fake_carried_inputs second: ", fake_carried_inputs)
 
+  print("!!! arrive here too before cond !!!")
   # generate cond_fn xlacomputation
   # TODO(@manfei): specify which element is for which argument like a,b,c
   cond_result = cond_fn(*fake_carried_inputs) # [:-3], weight_0=fake_carried_inputs[-2], output_value=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-1])
@@ -92,7 +93,9 @@ def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_input
   cond_hlo = cond_ctx.hlo()
   cond_computation = xb.computation_from_module_proto("condcomputation",
                                                       cond_hlo)
+  print("!!! arrive here too after cond !!!")
 
+  print("!!! arrive here too before body !!!")
   # generate body_fn xlacomputation
   body_result = body_fn(*fake_carried_inputs) # [:-3], weight_0=fake_carried_inputs[-1], output_value=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-2])
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
@@ -109,7 +112,9 @@ def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_input
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",
                                                       body_hlo)
+  print("!!! arrive here too after body !!!")
 
+  print("!!! arrive here too before args!!!")
   # trans fake_carried_inputs from list(tensor) to list(xla::op), which part could change init of xla::while
   kwargs = {}
   if type(carried_inputs) is tuple:
@@ -130,10 +135,13 @@ def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_input
       body_computation=body_computation)
   name = 'fori_loop_ed_torch_func'
   computation = w.build(name)
+  print("!!! arrive here too after args!!!")
 
+  print("!!! arrive here too before while!!!")
   # gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',
                                                  (carried_inputs),
                                                  computation)
+  print("!!! arrive here too after while!!!")
 
   return result
\ No newline at end of file

From 0016420a184ca374cb32ba96c568add2cf9a4df4 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 18:42:14 +0000
Subject: [PATCH 161/323] update

---
 torch_xla/experimental/fori_loop.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index a11697e33c5..c9decb56964 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -87,8 +87,10 @@ def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_input
   cond_ctx.set_name_string("condctx")
   additional_inputs_list_cond = list(fake_carried_inputs[2:]) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
   # treat and pass additional_inputs to cond_fn
+  print("additional_inputs_list_cond one: ", additional_inputs_list_cond)
   for i in range(len(additional_inputs)):
     additional_inputs_list_cond.append(additional_inputs[i])
+  print("additional_inputs_list_cond two: ", additional_inputs_list_cond)
   cond_ctx.buildforiloop([cond_result], additional_inputs_list_cond)
   cond_hlo = cond_ctx.hlo()
   cond_computation = xb.computation_from_module_proto("condcomputation",

From 42745757d6d08f90622e21f7f8f9012b0311eddf Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 18:43:01 +0000
Subject: [PATCH 162/323] update

---
 torch_xla/experimental/fori_loop.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index c9decb56964..048da5eb9a2 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -83,6 +83,7 @@ def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_input
   # generate cond_fn xlacomputation
   # TODO(@manfei): specify which element is for which argument like a,b,c
   cond_result = cond_fn(*fake_carried_inputs) # [:-3], weight_0=fake_carried_inputs[-2], output_value=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-1])
+  print("nnn here ???")
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
   additional_inputs_list_cond = list(fake_carried_inputs[2:]) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor

From 4fd2e4a0cb6b4edd3c89bd5a2d4223138154a763 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 18:43:44 +0000
Subject: [PATCH 163/323] update

---
 torch_xla/experimental/fori_loop.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 048da5eb9a2..0d89dfce829 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -81,9 +81,10 @@ def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_input
 
   print("!!! arrive here too before cond !!!")
   # generate cond_fn xlacomputation
+  print("print fake_carried_inputs: ", fake_carried_inputs)
   # TODO(@manfei): specify which element is for which argument like a,b,c
   cond_result = cond_fn(*fake_carried_inputs) # [:-3], weight_0=fake_carried_inputs[-2], output_value=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-1])
-  print("nnn here ???")
+  # print("nnn here ???")
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
   additional_inputs_list_cond = list(fake_carried_inputs[2:]) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor

From b06290031ed720bb41fe2ae8da1817faa27c45a6 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 18:45:29 +0000
Subject: [PATCH 164/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 1ca0d4aa8fb..93eab72bc9b 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -100,8 +100,8 @@ def __init__(self):
             self.linear = torch.nn.Linear(10, 20).to(xm.xla_device())
             # self.register_buffer("dec", torch.tensor(1))
 
-        # def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
-        def forward(self, upper, lower, one_value, x, input_value, output_value):
+        def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+        # def forward(self, upper, lower, one_value, x, input_value, output_value):
             weight_1 = self.linear.weight
             bias_1 = self.linear.bias
             def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):

From c2f29737be0856f08302d13367ba8e49b68b2dc7 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 18:46:06 +0000
Subject: [PATCH 165/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 93eab72bc9b..1ca0d4aa8fb 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -100,8 +100,8 @@ def __init__(self):
             self.linear = torch.nn.Linear(10, 20).to(xm.xla_device())
             # self.register_buffer("dec", torch.tensor(1))
 
-        def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
-        # def forward(self, upper, lower, one_value, x, input_value, output_value):
+        # def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+        def forward(self, upper, lower, one_value, x, input_value, output_value):
             weight_1 = self.linear.weight
             bias_1 = self.linear.bias
             def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):

From 8dbde24acbfcd0312728b8d02a1ba7be8be5a7b3 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 18:47:08 +0000
Subject: [PATCH 166/323] update

---
 ...ri_loop_with_while_loop_simple_add_dispatch_in_torch.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 1ca0d4aa8fb..4e93f32d48b 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -104,11 +104,14 @@ def __init__(self):
         def forward(self, upper, lower, one_value, x, input_value, output_value):
             weight_1 = self.linear.weight
             bias_1 = self.linear.bias
-            def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+
+            # def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            def cond_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
             # def cond_fn(upper, lower, one_value, x, input_value, output_value):
                 return lower[0] < upper[0]
 
-            def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            # def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            def body_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
             # def body_fn(upper, lower, one_value, x, input_value, output_value):
                 new_lower = torch.add(one_value, lower)
                 output_value_real = self.linear(input_value)

From 066047c56280cd1204a41c38377fa0eb3bd5ac7b Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 20:19:07 +0000
Subject: [PATCH 167/323] update

---
 ..._with_while_loop_simple_add_dispatch_in_torch.py | 13 +++++++------
 torch_xla/experimental/fori_loop.py                 |  2 +-
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 4e93f32d48b..7d12ab6f517 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -102,16 +102,16 @@ def __init__(self):
 
         # def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
         def forward(self, upper, lower, one_value, x, input_value, output_value):
-            weight_1 = self.linear.weight
-            bias_1 = self.linear.bias
+            # weight_1 = self.linear.weight
+            # bias_1 = self.linear.bias
 
-            # def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
-            def cond_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
+            def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            # def cond_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
             # def cond_fn(upper, lower, one_value, x, input_value, output_value):
                 return lower[0] < upper[0]
 
-            # def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
-            def body_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
+            def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            # def body_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
             # def body_fn(upper, lower, one_value, x, input_value, output_value):
                 new_lower = torch.add(one_value, lower)
                 output_value_real = self.linear(input_value)
@@ -128,6 +128,7 @@ def body_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_va
             # return upper, lower, one_value, x, input_value, output_value
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_0, bias_0, output_value))
             return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_1, bias_1, output_value))
+            # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
             # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 0d89dfce829..600950f48ac 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -84,7 +84,7 @@ def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_input
   print("print fake_carried_inputs: ", fake_carried_inputs)
   # TODO(@manfei): specify which element is for which argument like a,b,c
   cond_result = cond_fn(*fake_carried_inputs) # [:-3], weight_0=fake_carried_inputs[-2], output_value=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-1])
-  # print("nnn here ???")
+  print("nnn here ???")
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
   additional_inputs_list_cond = list(fake_carried_inputs[2:]) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor

From 374d16d06cc541ff5f301764372d487aa74fa4ff Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 20:19:48 +0000
Subject: [PATCH 168/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 7d12ab6f517..6133d205242 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -127,8 +127,8 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_va
             # return 1
             # return upper, lower, one_value, x, input_value, output_value
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_0, bias_0, output_value))
-            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_1, bias_1, output_value))
-            # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
+            # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_1, bias_1, output_value))
+            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
             # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()

From 1f30e268996448e19935b8289cde2323633e3e03 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 20:21:34 +0000
Subject: [PATCH 169/323] update

---
 ...i_loop_with_while_loop_simple_add_dispatch_in_torch.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 6133d205242..6162c3b921f 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -105,14 +105,14 @@ def forward(self, upper, lower, one_value, x, input_value, output_value):
             # weight_1 = self.linear.weight
             # bias_1 = self.linear.bias
 
-            def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            # def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             # def cond_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
-            # def cond_fn(upper, lower, one_value, x, input_value, output_value):
+            def cond_fn(upper, lower, one_value, x, input_value, output_value):
                 return lower[0] < upper[0]
 
-            def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            # def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             # def body_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
-            # def body_fn(upper, lower, one_value, x, input_value, output_value):
+            def body_fn(upper, lower, one_value, x, input_value, output_value):
                 new_lower = torch.add(one_value, lower)
                 output_value_real = self.linear(input_value)
                 weight = self.linear.weight

From fc85bc3e1dcf7cf80dea7714d359cf7503b2251a Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 20:23:17 +0000
Subject: [PATCH 170/323] update

---
 torch_xla/experimental/fori_loop.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 600950f48ac..fab9ba09c50 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -53,12 +53,13 @@ def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
 def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_inputs=()):
   print("!!! arrive here too too !!!")
   # print("carried_inputs: ", carried_inputs)
-  # print("additional_inputs: ", additional_inputs)
+  print("additional_inputs: ", additional_inputs)
   # import pdb; pdb.set_trace()
   # untuple carried_inputs from while_loop
   carried_inputs = original_carried_inputs[0]
   # TODO(@manfei): please clear pass additional_inputs in `while_loop`'s defination in this file
   if len(original_carried_inputs) == 2:
+    print("use original_carried_inputs for additional_inputs")
     additional_inputs = original_carried_inputs[1]
   # fake carried_inputs to split formal code
   fake_carried_inputs = []

From 6ebda73b33bb1e61279b513f67bac44f20214c45 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 20:24:25 +0000
Subject: [PATCH 171/323] update

---
 torch_xla/experimental/fori_loop.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index fab9ba09c50..097f67687a3 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -45,6 +45,7 @@ def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
   # cond_fn&body_fn: callable
   # carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)
   print("!!! arrive here too !!!")
+  print("while_loop additional_inputs: ", additional_inputs)
   if additional_inputs is None:
     additional_inputs = tuple()
   return _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=additional_inputs) #  a=a, b=b, c=c,

From 55c2ea11432f9c0de4fcf2f6535a2abe072780e2 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 20:30:32 +0000
Subject: [PATCH 172/323] update

---
 torch_xla/experimental/fori_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 097f67687a3..31bc893f1d9 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -40,7 +40,7 @@ def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bi
 
 
 @while_loop_op.py_impl(DispatchKey.XLA)
-def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=None):
+def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
   # TODO(@manfei): PyTorch require carried_inputs to be list/tuple, PyTorch/XLA _xla_while_loop only accept *operands, *operands would tuple items again: (a, '')
   # cond_fn&body_fn: callable
   # carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)

From 36d256525fc2bd140abaaec4794e154e3f0e1f81 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 20:38:12 +0000
Subject: [PATCH 173/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 6162c3b921f..c8276bfb16d 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -92,7 +92,7 @@ def test_while_loop_tpu_simple_linear(self):
 
     xm.mark_step()
     device = xm.xla_device()
-    torch.set_grad_enabled(False)
+    # torch.set_grad_enabled(False)
 
     class SimpleWithLinear(torch.nn.Module):
         def __init__(self):

From 4a500c019914096ad1398b13b4403b34ab575c54 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 20:39:39 +0000
Subject: [PATCH 174/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index c8276bfb16d..6162c3b921f 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -92,7 +92,7 @@ def test_while_loop_tpu_simple_linear(self):
 
     xm.mark_step()
     device = xm.xla_device()
-    # torch.set_grad_enabled(False)
+    torch.set_grad_enabled(False)
 
     class SimpleWithLinear(torch.nn.Module):
         def __init__(self):

From 3a2553e4b9335c2b74ca117cfd500d72dac9823d Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 20:50:00 +0000
Subject: [PATCH 175/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 6162c3b921f..152606a7d6a 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -128,7 +128,9 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
             # return upper, lower, one_value, x, input_value, output_value
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_0, bias_0, output_value))
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_1, bias_1, output_value))
-            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
+            weight_1 = self.linear.weight
+            bias_1 = self.linear.bias
+            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value), (bias_1, weight_1))
             # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()

From 0998b374e1344edb7d903be54e76ff9fb3ccb1a6 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 20:51:08 +0000
Subject: [PATCH 176/323] update

---
 ...ri_loop_with_while_loop_simple_add_dispatch_in_torch.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 152606a7d6a..7fd99631943 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -128,9 +128,10 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
             # return upper, lower, one_value, x, input_value, output_value
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_0, bias_0, output_value))
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_1, bias_1, output_value))
-            weight_1 = self.linear.weight
-            bias_1 = self.linear.bias
-            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value), (bias_1, weight_1))
+            # weight_1 = self.linear.weight
+            # bias_1 = self.linear.bias
+            # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value), (bias_1, weight_1))
+            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
             # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()

From 267e8b3460fe4b2e9a3779383843ec8e19c33acc Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 20:54:12 +0000
Subject: [PATCH 177/323] update

---
 torch_xla/experimental/fori_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 31bc893f1d9..68db30dd6fa 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -40,7 +40,7 @@ def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bi
 
 
 @while_loop_op.py_impl(DispatchKey.XLA)
-def while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs):
+def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs):
   # TODO(@manfei): PyTorch require carried_inputs to be list/tuple, PyTorch/XLA _xla_while_loop only accept *operands, *operands would tuple items again: (a, '')
   # cond_fn&body_fn: callable
   # carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)

From a577859eff905e076e25a06a2b0d70101901f948 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 20:57:08 +0000
Subject: [PATCH 178/323] update

---
 torch_xla/experimental/fori_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 68db30dd6fa..fb8f8234b16 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -45,7 +45,7 @@ def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs):
   # cond_fn&body_fn: callable
   # carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)
   print("!!! arrive here too !!!")
-  print("while_loop additional_inputs: ", additional_inputs)
+  # print("while_loop additional_inputs: ", additional_inputs)
   if additional_inputs is None:
     additional_inputs = tuple()
   return _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=additional_inputs) #  a=a, b=b, c=c,
@@ -53,7 +53,7 @@ def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs):
 
 def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_inputs=()):
   print("!!! arrive here too too !!!")
-  # print("carried_inputs: ", carried_inputs)
+  print("carried_inputs: ", carried_inputs)
   print("additional_inputs: ", additional_inputs)
   # import pdb; pdb.set_trace()
   # untuple carried_inputs from while_loop

From a3ee72aefe21c863b4cfc052ab82a9468cc85b9e Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 20:57:39 +0000
Subject: [PATCH 179/323] update

---
 torch_xla/experimental/fori_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index fb8f8234b16..8c8d47ca494 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -53,7 +53,7 @@ def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs):
 
 def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_inputs=()):
   print("!!! arrive here too too !!!")
-  print("carried_inputs: ", carried_inputs)
+  print("original_carried_inputs: ", original_carried_inputs)
   print("additional_inputs: ", additional_inputs)
   # import pdb; pdb.set_trace()
   # untuple carried_inputs from while_loop

From 293b87aa9bf7729c995345e3964b3eb311c78a04 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 21:00:34 +0000
Subject: [PATCH 180/323] update

---
 torch_xla/experimental/fori_loop.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 8c8d47ca494..aceb585ab45 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -53,11 +53,11 @@ def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs):
 
 def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_inputs=()):
   print("!!! arrive here too too !!!")
-  print("original_carried_inputs: ", original_carried_inputs)
-  print("additional_inputs: ", additional_inputs)
+  # print("original_carried_inputs: ", original_carried_inputs)
+  # print("additional_inputs: ", additional_inputs)
   # import pdb; pdb.set_trace()
   # untuple carried_inputs from while_loop
-  carried_inputs = original_carried_inputs[0]
+  # carried_inputs = original_carried_inputs[0] # due to PyTorch has already treat them , so skip split here
   # TODO(@manfei): please clear pass additional_inputs in `while_loop`'s defination in this file
   if len(original_carried_inputs) == 2:
     print("use original_carried_inputs for additional_inputs")

From 128a3dcf6a37dd959716777fcb8edb9447556cad Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 21:02:13 +0000
Subject: [PATCH 181/323] update

---
 ...op_with_while_loop_simple_add_dispatch_in_torch.py |  8 ++++----
 torch_xla/experimental/fori_loop.py                   | 11 ++++++-----
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 7fd99631943..8e5eead8fc3 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -105,14 +105,14 @@ def forward(self, upper, lower, one_value, x, input_value, output_value):
             # weight_1 = self.linear.weight
             # bias_1 = self.linear.bias
 
-            # def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             # def cond_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
-            def cond_fn(upper, lower, one_value, x, input_value, output_value):
+            # def cond_fn(upper, lower, one_value, x, input_value, output_value):
                 return lower[0] < upper[0]
 
-            # def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             # def body_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
-            def body_fn(upper, lower, one_value, x, input_value, output_value):
+            # def body_fn(upper, lower, one_value, x, input_value, output_value):
                 new_lower = torch.add(one_value, lower)
                 output_value_real = self.linear(input_value)
                 weight = self.linear.weight
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index aceb585ab45..1b874ed9db2 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -51,17 +51,18 @@ def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs):
   return _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=additional_inputs) #  a=a, b=b, c=c,
 
 
-def _xla_while_loop(cond_fn, body_fn, *original_carried_inputs, additional_inputs=()):
+def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   print("!!! arrive here too too !!!")
   # print("original_carried_inputs: ", original_carried_inputs)
   # print("additional_inputs: ", additional_inputs)
   # import pdb; pdb.set_trace()
   # untuple carried_inputs from while_loop
-  # carried_inputs = original_carried_inputs[0] # due to PyTorch has already treat them , so skip split here
+  # carried_inputs = original_carried_inputs[0] ### due to PyTorch has already treat them , so skip split here
   # TODO(@manfei): please clear pass additional_inputs in `while_loop`'s defination in this file
-  if len(original_carried_inputs) == 2:
-    print("use original_carried_inputs for additional_inputs")
-    additional_inputs = original_carried_inputs[1]
+  ### due to PyTorch has already treat them , so skip split here
+  # if len(original_carried_inputs) == 2:
+  #   print("use original_carried_inputs for additional_inputs")
+  #   additional_inputs = original_carried_inputs[1]
   # fake carried_inputs to split formal code
   fake_carried_inputs = []
   for carried_input in carried_inputs:

From f5298a59da61247956280cc274f478d0edfc6c39 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 21:03:27 +0000
Subject: [PATCH 182/323] update

---
 ...i_loop_with_while_loop_simple_add_dispatch_in_torch.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 8e5eead8fc3..7fd99631943 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -105,14 +105,14 @@ def forward(self, upper, lower, one_value, x, input_value, output_value):
             # weight_1 = self.linear.weight
             # bias_1 = self.linear.bias
 
-            def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            # def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             # def cond_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
-            # def cond_fn(upper, lower, one_value, x, input_value, output_value):
+            def cond_fn(upper, lower, one_value, x, input_value, output_value):
                 return lower[0] < upper[0]
 
-            def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            # def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             # def body_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
-            # def body_fn(upper, lower, one_value, x, input_value, output_value):
+            def body_fn(upper, lower, one_value, x, input_value, output_value):
                 new_lower = torch.add(one_value, lower)
                 output_value_real = self.linear(input_value)
                 weight = self.linear.weight

From e4104a4719837c6071ee2d1088af757f64ca0cf2 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 21:05:51 +0000
Subject: [PATCH 183/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 7fd99631943..a4566395afe 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -101,7 +101,8 @@ def __init__(self):
             # self.register_buffer("dec", torch.tensor(1))
 
         # def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
-        def forward(self, upper, lower, one_value, x, input_value, output_value):
+        # def forward(self, upper, lower, one_value, x, input_value, output_value):
+        def forward(self, upper, lower, one_value, x, input_value, output_value, bias_0, weight_0):
             # weight_1 = self.linear.weight
             # bias_1 = self.linear.bias
 

From 2ef7d32a0a3465de9f0bb4f8b59026df74f0b407 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 21:07:16 +0000
Subject: [PATCH 184/323] update

---
 ...p_with_while_loop_simple_add_dispatch_in_torch.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index a4566395afe..0bce0779c6b 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -100,20 +100,20 @@ def __init__(self):
             self.linear = torch.nn.Linear(10, 20).to(xm.xla_device())
             # self.register_buffer("dec", torch.tensor(1))
 
-        # def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+        def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
         # def forward(self, upper, lower, one_value, x, input_value, output_value):
-        def forward(self, upper, lower, one_value, x, input_value, output_value, bias_0, weight_0):
+        # def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             # weight_1 = self.linear.weight
             # bias_1 = self.linear.bias
 
-            # def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             # def cond_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
-            def cond_fn(upper, lower, one_value, x, input_value, output_value):
+            # def cond_fn(upper, lower, one_value, x, input_value, output_value):
                 return lower[0] < upper[0]
 
-            # def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             # def body_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
-            def body_fn(upper, lower, one_value, x, input_value, output_value):
+            # def body_fn(upper, lower, one_value, x, input_value, output_value):
                 new_lower = torch.add(one_value, lower)
                 output_value_real = self.linear(input_value)
                 weight = self.linear.weight

From 4affdd72b63047ab0c799bd6bcba595d11aaa158 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 21:09:27 +0000
Subject: [PATCH 185/323] update

---
 ...p_with_while_loop_simple_add_dispatch_in_torch.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 0bce0779c6b..eb108d59c46 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -100,20 +100,20 @@ def __init__(self):
             self.linear = torch.nn.Linear(10, 20).to(xm.xla_device())
             # self.register_buffer("dec", torch.tensor(1))
 
-        def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
-        # def forward(self, upper, lower, one_value, x, input_value, output_value):
+        # def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+        def forward(self, upper, lower, one_value, x, input_value, output_value):
         # def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             # weight_1 = self.linear.weight
             # bias_1 = self.linear.bias
 
-            def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            # def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             # def cond_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
-            # def cond_fn(upper, lower, one_value, x, input_value, output_value):
+            def cond_fn(upper, lower, one_value, x, input_value, output_value):
                 return lower[0] < upper[0]
 
-            def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+            # def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             # def body_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
-            # def body_fn(upper, lower, one_value, x, input_value, output_value):
+            def body_fn(upper, lower, one_value, x, input_value, output_value):
                 new_lower = torch.add(one_value, lower)
                 output_value_real = self.linear(input_value)
                 weight = self.linear.weight

From 325425388cdb06b62d56c13713ee6cc3b30d6238 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 21:10:58 +0000
Subject: [PATCH 186/323] update

---
 torch_xla/experimental/fori_loop.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 1b874ed9db2..29735ee1d38 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -72,15 +72,15 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
         torch.randint(10, carried_input.size(),
                       dtype=carried_input.dtype).to(device))
   # fake_carried_inputs = tuple(fake_carried_inputs)
-  # print("fake_carried_inputs first: ", fake_carried_inputs)
-  # for additional_input in additional_inputs:
-  #   device = additional_input.device
-  #   #TODO(@manfei) type = carried_input.type
-  #   fake_carried_inputs.append(
-  #       torch.randint(10, additional_input.size(),
-  #                     dtype=additional_input.dtype).to(device))
+  print("fake_carried_inputs first: ", fake_carried_inputs)
+  for additional_input in additional_inputs:
+    device = additional_input.device
+    #TODO(@manfei) type = carried_input.type
+    fake_carried_inputs.append(
+        torch.randint(10, additional_input.size(),
+                      dtype=additional_input.dtype).to(device))
   # fake_carried_inputs = tuple(fake_carried_inputs)
-  # # print("fake_carried_inputs second: ", fake_carried_inputs)
+  print("fake_carried_inputs second: ", fake_carried_inputs)
 
   print("!!! arrive here too before cond !!!")
   # generate cond_fn xlacomputation

From 92887e70e25c00f9789fe8e567b9f4e1b97412ae Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 21:12:31 +0000
Subject: [PATCH 187/323] update

---
 torch_xla/csrc/init_python_bindings.cpp | 28 ++++++++++++-------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 145739e1d1a..fa29bfdd4c8 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -925,26 +925,26 @@ class PyLoweringContext {
       // !!! --- next step: we add dump paras according to additional_inputs_list
       // ??? --- could we get IRvalue of `additional_inputs_list` in this function to complete xlacomputation?
       int64_t parameter_idx = 2; // parameter_idx start from 2 after upper and lower
-      for (auto& additional_input_tensor : additional_inputs_list) {
-        XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
-        xla::Shape shape = xtensor->shape().get();
-        xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
-                                      "UnusedArgumentsPlaceholder");
-        parameter_idx += 1;
-      }
+      // for (auto& additional_input_tensor : additional_inputs_list) {
+      //   XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
+      //   xla::Shape shape = xtensor->shape().get();
+      //   xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
+      //                                 "UnusedArgumentsPlaceholder");
+      //   parameter_idx += 1;
+      // }
     }
 
     // hard-code modify body xlacomputation input arguments
     if (GetNameString() == "bodyctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
       int64_t parameter_idx = 7; // tensors.size();
-      for (auto& additional_input_tensor : additional_inputs_list) {
-        XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
-        xla::Shape shape = xtensor->shape().get();
-        xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
-                                      "UnusedArgumentsPlaceholder");
-        parameter_idx += 1;
-      }
+      // for (auto& additional_input_tensor : additional_inputs_list) {
+      //   XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
+      //   xla::Shape shape = xtensor->shape().get();
+      //   xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
+      //                                 "UnusedArgumentsPlaceholder");
+      //   parameter_idx += 1;
+      // }
     }
 
     // Get the backing XLA tensors from the output torch tensor handles

From 98425a88746da7d5d70a26ce02a70780252159c8 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 21:14:30 +0000
Subject: [PATCH 188/323] update

---
 torch_xla/experimental/fori_loop.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 29735ee1d38..d7f54af3ee4 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -133,7 +133,10 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   for shape in shapes:
     p = xb.mkparam(builder, len(params), shape)
     params.append(p)
+  print("args params: ", params)
+  print("!!! arrive here too after args!!!")
 
+  print("!!! arrive here too before while!!!")
   # generate while xlacomputation
   input_tuple = xb.Op.tuple(tuple(params))
   w = xb.mkop(
@@ -142,9 +145,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
       body_computation=body_computation)
   name = 'fori_loop_ed_torch_func'
   computation = w.build(name)
-  print("!!! arrive here too after args!!!")
 
-  print("!!! arrive here too before while!!!")
   # gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',
                                                  (carried_inputs),

From 862e3f16f1259819ad7d3a322080de8ef30f5916 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 21:18:22 +0000
Subject: [PATCH 189/323] update

---
 torch_xla/experimental/fori_loop.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index d7f54af3ee4..94080b31979 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -122,6 +122,8 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   print("!!! arrive here too after body !!!")
 
   print("!!! arrive here too before args!!!")
+  total_inputs = carried_inputs + additional_inputs
+  print("total_inputs: ", total_inputs)
   # trans fake_carried_inputs from list(tensor) to list(xla::op), which part could change init of xla::while
   kwargs = {}
   if type(carried_inputs) is tuple:

From 6b07e22d1d6eddaed524b89d79b4968c0a196815 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 21:20:13 +0000
Subject: [PATCH 190/323] update

---
 torch_xla/experimental/fori_loop.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 94080b31979..27d6c214980 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -126,10 +126,10 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   print("total_inputs: ", total_inputs)
   # trans fake_carried_inputs from list(tensor) to list(xla::op), which part could change init of xla::while
   kwargs = {}
-  if type(carried_inputs) is tuple:
-    shapes = xb.tensor_shape(carried_inputs)
+  if type(total_inputs) is tuple:
+    shapes = xb.tensor_shape(total_inputs)
   else:
-    shapes = xb.tensor_shape((carried_inputs))
+    shapes = xb.tensor_shape((total_inputs))
   builder = xb.create_builder('test_while')
   params = []
   for shape in shapes:

From 1edc7bd448559b440036620c2004b7272eeda7e8 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 22:10:38 +0000
Subject: [PATCH 191/323] update

---
 torch_xla/csrc/init_python_bindings.cpp | 28 ++++++++++++-------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index fa29bfdd4c8..145739e1d1a 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -925,26 +925,26 @@ class PyLoweringContext {
       // !!! --- next step: we add dump paras according to additional_inputs_list
       // ??? --- could we get IRvalue of `additional_inputs_list` in this function to complete xlacomputation?
       int64_t parameter_idx = 2; // parameter_idx start from 2 after upper and lower
-      // for (auto& additional_input_tensor : additional_inputs_list) {
-      //   XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
-      //   xla::Shape shape = xtensor->shape().get();
-      //   xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
-      //                                 "UnusedArgumentsPlaceholder");
-      //   parameter_idx += 1;
-      // }
+      for (auto& additional_input_tensor : additional_inputs_list) {
+        XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
+        xla::Shape shape = xtensor->shape().get();
+        xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
+                                      "UnusedArgumentsPlaceholder");
+        parameter_idx += 1;
+      }
     }
 
     // hard-code modify body xlacomputation input arguments
     if (GetNameString() == "bodyctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
       int64_t parameter_idx = 7; // tensors.size();
-      // for (auto& additional_input_tensor : additional_inputs_list) {
-      //   XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
-      //   xla::Shape shape = xtensor->shape().get();
-      //   xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
-      //                                 "UnusedArgumentsPlaceholder");
-      //   parameter_idx += 1;
-      // }
+      for (auto& additional_input_tensor : additional_inputs_list) {
+        XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
+        xla::Shape shape = xtensor->shape().get();
+        xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
+                                      "UnusedArgumentsPlaceholder");
+        parameter_idx += 1;
+      }
     }
 
     // Get the backing XLA tensors from the output torch tensor handles

From 4cfa52231c1fb933fa49802e08dfce1f704d1923 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 22:42:01 +0000
Subject: [PATCH 192/323] update

---
 torch_xla/experimental/fori_loop.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 27d6c214980..118358ff81b 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -44,7 +44,7 @@ def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs):
   # TODO(@manfei): PyTorch require carried_inputs to be list/tuple, PyTorch/XLA _xla_while_loop only accept *operands, *operands would tuple items again: (a, '')
   # cond_fn&body_fn: callable
   # carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)
-  print("!!! arrive here too !!!")
+  print("!!! arrive here @while_loop_op.py_impl(DispatchKey.XLA) !!!")
   # print("while_loop additional_inputs: ", additional_inputs)
   if additional_inputs is None:
     additional_inputs = tuple()
@@ -52,7 +52,7 @@ def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs):
 
 
 def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
-  print("!!! arrive here too too !!!")
+  print("!!! arrive here def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()): !!!")
   # print("original_carried_inputs: ", original_carried_inputs)
   # print("additional_inputs: ", additional_inputs)
   # import pdb; pdb.set_trace()
@@ -72,7 +72,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
         torch.randint(10, carried_input.size(),
                       dtype=carried_input.dtype).to(device))
   # fake_carried_inputs = tuple(fake_carried_inputs)
-  print("fake_carried_inputs first: ", fake_carried_inputs)
+  # print("fake_carried_inputs first: ", fake_carried_inputs)
   for additional_input in additional_inputs:
     device = additional_input.device
     #TODO(@manfei) type = carried_input.type
@@ -80,14 +80,14 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
         torch.randint(10, additional_input.size(),
                       dtype=additional_input.dtype).to(device))
   # fake_carried_inputs = tuple(fake_carried_inputs)
-  print("fake_carried_inputs second: ", fake_carried_inputs)
+  # print("fake_carried_inputs second: ", fake_carried_inputs)
 
   print("!!! arrive here too before cond !!!")
   # generate cond_fn xlacomputation
-  print("print fake_carried_inputs: ", fake_carried_inputs)
+  # print("print fake_carried_inputs: ", fake_carried_inputs)
   # TODO(@manfei): specify which element is for which argument like a,b,c
   cond_result = cond_fn(*fake_carried_inputs) # [:-3], weight_0=fake_carried_inputs[-2], output_value=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-1])
-  print("nnn here ???")
+  # print("nnn here ???")
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
   additional_inputs_list_cond = list(fake_carried_inputs[2:]) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
@@ -135,6 +135,13 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   for shape in shapes:
     p = xb.mkparam(builder, len(params), shape)
     params.append(p)
+  tmp_bias = params[-2]
+  tmp_output_value = params[-3]
+  del params[-3]
+  del params[-2]
+  params.append(tmp_bias)
+  params.append(tmp_output_value)
+
   print("args params: ", params)
   print("!!! arrive here too after args!!!")
 

From edb6fc72624998ce5dafe747f6172dfbb7303a14 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 22:42:52 +0000
Subject: [PATCH 193/323] update

---
 torch_xla/experimental/fori_loop.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 118358ff81b..b44e79e090c 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -93,9 +93,9 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   additional_inputs_list_cond = list(fake_carried_inputs[2:]) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
   # treat and pass additional_inputs to cond_fn
   print("additional_inputs_list_cond one: ", additional_inputs_list_cond)
-  for i in range(len(additional_inputs)):
-    additional_inputs_list_cond.append(additional_inputs[i])
-  print("additional_inputs_list_cond two: ", additional_inputs_list_cond)
+  # for i in range(len(additional_inputs)):
+  #   additional_inputs_list_cond.append(additional_inputs[i])
+  # print("additional_inputs_list_cond two: ", additional_inputs_list_cond)
   cond_ctx.buildforiloop([cond_result], additional_inputs_list_cond)
   cond_hlo = cond_ctx.hlo()
   cond_computation = xb.computation_from_module_proto("condcomputation",
@@ -111,8 +111,8 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   # TODO(@manfei): treat and pass additional_inputs to body_fn too
   # print("list(fake_carried_inputs[-2]: ", fake_carried_inputs[-2])
   # print("len0!!!: ", len(additional_inputs_list_body))
-  for i in range(len(additional_inputs)):
-    additional_inputs_list_body.append(additional_inputs[i])
+  # for i in range(len(additional_inputs)):
+  #   additional_inputs_list_body.append(additional_inputs[i])
   # print("len!!!: ", len(additional_inputs_list_body))
   # print("additional_inputs_list_body: ", additional_inputs_list_body)
   body_ctx.buildforiloop(list(body_result), additional_inputs_list_body)

From 99a6589851b4acccd2bc0c279f3f4902e17e3173 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 22:50:09 +0000
Subject: [PATCH 194/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 torch_xla/experimental/fori_loop.py                           | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index eb108d59c46..2629b28a543 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -116,13 +116,13 @@ def cond_fn(upper, lower, one_value, x, input_value, output_value):
             def body_fn(upper, lower, one_value, x, input_value, output_value):
                 new_lower = torch.add(one_value, lower)
                 output_value_real = self.linear(input_value)
-                weight = self.linear.weight
+                weight = self.linear.weight # not be used actually, would be used as 
                 bias = self.linear.bias
                 # new_upper = upper
                 # new_one_value = one_value
                 # new_input_value = input_value
                 # return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value_real
-                return upper.clone(), lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), weight.clone(), bias.clone(), output_value_real
+                return upper.clone(), lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real, bias.clone(), weight.clone()
             # return while_loop(cond_fn, body_fn, (iter, x))
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
             # return 1
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index b44e79e090c..68e62d902ec 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -107,7 +107,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   body_result = body_fn(*fake_carried_inputs) # [:-3], weight_0=fake_carried_inputs[-1], output_value=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-2])
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
   body_ctx.set_name_string("bodyctx")
-  additional_inputs_list_body = [fake_carried_inputs[-2]] # missed arguments due to given output_value was not used and PyTorch/XLA trace xlacomputation from output tensor
+  additional_inputs_list_body = [fake_carried_inputs[-3]] # missed arguments due to given output_value was not used and PyTorch/XLA trace xlacomputation from output tensor
   # TODO(@manfei): treat and pass additional_inputs to body_fn too
   # print("list(fake_carried_inputs[-2]: ", fake_carried_inputs[-2])
   # print("len0!!!: ", len(additional_inputs_list_body))

From a34cd6fe3390039a3e45ea10dbf737828df3d797 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 22:53:05 +0000
Subject: [PATCH 195/323] update

---
 torch_xla/experimental/fori_loop.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 68e62d902ec..25c3dc917a7 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -63,6 +63,11 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   # if len(original_carried_inputs) == 2:
   #   print("use original_carried_inputs for additional_inputs")
   #   additional_inputs = original_carried_inputs[1]
+
+  # exchange order of bias and weight in additional_inputs
+  (bias_p, weight_p) = additional_inputs
+  additional_inputs = (weight_p, bias_p)
+
   # fake carried_inputs to split formal code
   fake_carried_inputs = []
   for carried_input in carried_inputs:

From be562ad96416296b057317b0b69543a5324947ed Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 22:54:09 +0000
Subject: [PATCH 196/323] update

---
 torch_xla/experimental/fori_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 25c3dc917a7..337b05c75b3 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -85,7 +85,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
         torch.randint(10, additional_input.size(),
                       dtype=additional_input.dtype).to(device))
   # fake_carried_inputs = tuple(fake_carried_inputs)
-  # print("fake_carried_inputs second: ", fake_carried_inputs)
+  print("fake_carried_inputs second: ", fake_carried_inputs)
 
   print("!!! arrive here too before cond !!!")
   # generate cond_fn xlacomputation

From 11c1b546ea9490a7567093ece87c4344e9c880ca Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 22:56:14 +0000
Subject: [PATCH 197/323] update

---
 torch_xla/experimental/fori_loop.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 337b05c75b3..aa3730e7d7e 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -64,9 +64,9 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   #   print("use original_carried_inputs for additional_inputs")
   #   additional_inputs = original_carried_inputs[1]
 
-  # exchange order of bias and weight in additional_inputs
-  (bias_p, weight_p) = additional_inputs
-  additional_inputs = (weight_p, bias_p)
+  # # exchange order of bias and weight in additional_inputs
+  # (bias_p, weight_p) = additional_inputs
+  # additional_inputs = (weight_p, bias_p)
 
   # fake carried_inputs to split formal code
   fake_carried_inputs = []

From 2480b5b798dadacab4fc0d7da2e1ce329224879a Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 22:58:08 +0000
Subject: [PATCH 198/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 2629b28a543..c126636b59e 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -122,7 +122,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
                 # new_one_value = one_value
                 # new_input_value = input_value
                 # return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value_real
-                return upper.clone(), lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real, bias.clone(), weight.clone()
+                return upper.clone(), lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real, weight.clone(), bias.clone()
             # return while_loop(cond_fn, body_fn, (iter, x))
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
             # return 1

From cce486d4e9fdd967d2883d1a711e6693ef5be9d5 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:02:13 +0000
Subject: [PATCH 199/323] update

---
 torch_xla/experimental/fori_loop.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index aa3730e7d7e..0429b97cf3a 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -96,6 +96,14 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
   additional_inputs_list_cond = list(fake_carried_inputs[2:]) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
+
+  tmp_bias = additional_inputs_list_cond[-2]
+  tmp_output_value = additional_inputs_list_cond[-3]
+  del additional_inputs_list_cond[-3]
+  del additional_inputs_list_cond[-2]
+  additional_inputs_list_cond.append(tmp_bias)
+  additional_inputs_list_cond.append(tmp_output_value)
+
   # treat and pass additional_inputs to cond_fn
   print("additional_inputs_list_cond one: ", additional_inputs_list_cond)
   # for i in range(len(additional_inputs)):
@@ -140,12 +148,12 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   for shape in shapes:
     p = xb.mkparam(builder, len(params), shape)
     params.append(p)
-  tmp_bias = params[-2]
-  tmp_output_value = params[-3]
-  del params[-3]
-  del params[-2]
-  params.append(tmp_bias)
-  params.append(tmp_output_value)
+  # tmp_bias = params[-2]
+  # tmp_output_value = params[-3]
+  # del params[-3]
+  # del params[-2]
+  # params.append(tmp_bias)
+  # params.append(tmp_output_value)
 
   print("args params: ", params)
   print("!!! arrive here too after args!!!")

From 38d30b7aa47af946acbc3ef60412c51d40cf0c78 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:02:59 +0000
Subject: [PATCH 200/323] update

---
 torch_xla/experimental/fori_loop.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 0429b97cf3a..4d566b5e768 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -97,12 +97,12 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   cond_ctx.set_name_string("condctx")
   additional_inputs_list_cond = list(fake_carried_inputs[2:]) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
 
-  tmp_bias = additional_inputs_list_cond[-2]
-  tmp_output_value = additional_inputs_list_cond[-3]
-  del additional_inputs_list_cond[-3]
-  del additional_inputs_list_cond[-2]
-  additional_inputs_list_cond.append(tmp_bias)
-  additional_inputs_list_cond.append(tmp_output_value)
+  tmp_bias = additional_inputs_list_cond[-2] # not used, change order doesn't affect logic
+  tmp_output_value = additional_inputs_list_cond[-3] # not used, change order doesn't affect logic
+  del additional_inputs_list_cond[-3] # not used, change order doesn't affect logic
+  del additional_inputs_list_cond[-2] # not used, change order doesn't affect logic
+  additional_inputs_list_cond.append(tmp_bias) # not used, change order doesn't affect logic
+  additional_inputs_list_cond.append(tmp_output_value) # not used, change order doesn't affect logic
 
   # treat and pass additional_inputs to cond_fn
   print("additional_inputs_list_cond one: ", additional_inputs_list_cond)

From a85a8e30fb921b3cb577c935e7850332ace44757 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:04:54 +0000
Subject: [PATCH 201/323] update

---
 torch_xla/experimental/fori_loop.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 4d566b5e768..61d58043175 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -98,11 +98,11 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   additional_inputs_list_cond = list(fake_carried_inputs[2:]) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
 
   tmp_bias = additional_inputs_list_cond[-2] # not used, change order doesn't affect logic
-  tmp_output_value = additional_inputs_list_cond[-3] # not used, change order doesn't affect logic
-  del additional_inputs_list_cond[-3] # not used, change order doesn't affect logic
+  # tmp_output_value = additional_inputs_list_cond[-3] # not used, change order doesn't affect logic
+  # del additional_inputs_list_cond[-3] # not used, change order doesn't affect logic
   del additional_inputs_list_cond[-2] # not used, change order doesn't affect logic
   additional_inputs_list_cond.append(tmp_bias) # not used, change order doesn't affect logic
-  additional_inputs_list_cond.append(tmp_output_value) # not used, change order doesn't affect logic
+  # additional_inputs_list_cond.append(tmp_output_value) # not used, change order doesn't affect logic
 
   # treat and pass additional_inputs to cond_fn
   print("additional_inputs_list_cond one: ", additional_inputs_list_cond)
@@ -148,11 +148,12 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   for shape in shapes:
     p = xb.mkparam(builder, len(params), shape)
     params.append(p)
-  # tmp_bias = params[-2]
+
+  tmp_bias = params[-2]
   # tmp_output_value = params[-3]
   # del params[-3]
-  # del params[-2]
-  # params.append(tmp_bias)
+  del params[-2]
+  params.append(tmp_bias)
   # params.append(tmp_output_value)
 
   print("args params: ", params)

From 8b73e4322c9edc89557646ae40d1b4f4616d3653 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:06:20 +0000
Subject: [PATCH 202/323] update

---
 torch_xla/experimental/fori_loop.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 61d58043175..8def75a982d 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -169,6 +169,8 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   name = 'fori_loop_ed_torch_func'
   computation = w.build(name)
 
+  print("carried_inputs: ", carried_inputs)
+
   # gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',
                                                  (carried_inputs),

From 839f5d1a1029290234dc2c31480b67a533132786 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:07:26 +0000
Subject: [PATCH 203/323] update

---
 torch_xla/experimental/fori_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 8def75a982d..828035591f4 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -173,7 +173,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
 
   # gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',
-                                                 (carried_inputs),
+                                                 (total_inputs), # (carried_inputs),
                                                  computation)
   print("!!! arrive here too after while!!!")
 

From 4b96b9b6636ea415a5b791b7794dd8e210f8e227 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:10:21 +0000
Subject: [PATCH 204/323] update

---
 torch_xla/experimental/fori_loop.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 828035591f4..46ee2578cf0 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -44,7 +44,7 @@ def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs):
   # TODO(@manfei): PyTorch require carried_inputs to be list/tuple, PyTorch/XLA _xla_while_loop only accept *operands, *operands would tuple items again: (a, '')
   # cond_fn&body_fn: callable
   # carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)
-  print("!!! arrive here @while_loop_op.py_impl(DispatchKey.XLA) !!!")
+  # print("!!! arrive here @while_loop_op.py_impl(DispatchKey.XLA) !!!")
   # print("while_loop additional_inputs: ", additional_inputs)
   if additional_inputs is None:
     additional_inputs = tuple()
@@ -52,7 +52,7 @@ def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs):
 
 
 def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
-  print("!!! arrive here def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()): !!!")
+  # print("!!! arrive here def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()): !!!")
   # print("original_carried_inputs: ", original_carried_inputs)
   # print("additional_inputs: ", additional_inputs)
   # import pdb; pdb.set_trace()
@@ -85,9 +85,9 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
         torch.randint(10, additional_input.size(),
                       dtype=additional_input.dtype).to(device))
   # fake_carried_inputs = tuple(fake_carried_inputs)
-  print("fake_carried_inputs second: ", fake_carried_inputs)
+  # print("fake_carried_inputs second: ", fake_carried_inputs)
 
-  print("!!! arrive here too before cond !!!")
+  # print("!!! arrive here too before cond !!!")
   # generate cond_fn xlacomputation
   # print("print fake_carried_inputs: ", fake_carried_inputs)
   # TODO(@manfei): specify which element is for which argument like a,b,c
@@ -105,7 +105,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   # additional_inputs_list_cond.append(tmp_output_value) # not used, change order doesn't affect logic
 
   # treat and pass additional_inputs to cond_fn
-  print("additional_inputs_list_cond one: ", additional_inputs_list_cond)
+  # print("additional_inputs_list_cond one: ", additional_inputs_list_cond)
   # for i in range(len(additional_inputs)):
   #   additional_inputs_list_cond.append(additional_inputs[i])
   # print("additional_inputs_list_cond two: ", additional_inputs_list_cond)
@@ -113,9 +113,9 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   cond_hlo = cond_ctx.hlo()
   cond_computation = xb.computation_from_module_proto("condcomputation",
                                                       cond_hlo)
-  print("!!! arrive here too after cond !!!")
+  # print("!!! arrive here too after cond !!!")
 
-  print("!!! arrive here too before body !!!")
+  # print("!!! arrive here too before body !!!")
   # generate body_fn xlacomputation
   body_result = body_fn(*fake_carried_inputs) # [:-3], weight_0=fake_carried_inputs[-1], output_value=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-2])
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
@@ -132,11 +132,11 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",
                                                       body_hlo)
-  print("!!! arrive here too after body !!!")
+  # print("!!! arrive here too after body !!!")
 
-  print("!!! arrive here too before args!!!")
+  # print("!!! arrive here too before args!!!")
   total_inputs = carried_inputs + additional_inputs
-  print("total_inputs: ", total_inputs)
+  # print("total_inputs: ", total_inputs)
   # trans fake_carried_inputs from list(tensor) to list(xla::op), which part could change init of xla::while
   kwargs = {}
   if type(total_inputs) is tuple:
@@ -156,10 +156,10 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   params.append(tmp_bias)
   # params.append(tmp_output_value)
 
-  print("args params: ", params)
-  print("!!! arrive here too after args!!!")
+  # print("args params: ", params)
+  # print("!!! arrive here too after args!!!")
 
-  print("!!! arrive here too before while!!!")
+  # print("!!! arrive here too before while!!!")
   # generate while xlacomputation
   input_tuple = xb.Op.tuple(tuple(params))
   w = xb.mkop(
@@ -169,7 +169,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   name = 'fori_loop_ed_torch_func'
   computation = w.build(name)
 
-  print("carried_inputs: ", carried_inputs)
+  # print("carried_inputs: ", carried_inputs)
 
   # gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',

From d1e141a04509d88ff442bf4c50cc8f27c577b9ab Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:12:15 +0000
Subject: [PATCH 205/323] update

---
 torch_xla/experimental/fori_loop.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 46ee2578cf0..4bdf3d6c70b 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -170,11 +170,12 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   computation = w.build(name)
 
   # print("carried_inputs: ", carried_inputs)
+  print("total_inputs: ", total_inputs)
 
   # gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',
-                                                 (total_inputs), # (carried_inputs),
+                                                 (total_inputs),
                                                  computation)
-  print("!!! arrive here too after while!!!")
+  # print("!!! arrive here too after while!!!")
 
   return result
\ No newline at end of file

From 5f73913d30833b0bb69799bff3842076c06b5248 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:12:30 +0000
Subject: [PATCH 206/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index c126636b59e..aeef18c9841 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -109,7 +109,7 @@ def forward(self, upper, lower, one_value, x, input_value, output_value):
             # def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             # def cond_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
             def cond_fn(upper, lower, one_value, x, input_value, output_value):
-                return lower[0] < upper[0]
+                return lower[0] >= upper[0]
 
             # def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             # def body_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):

From 3fc27584f6f6c0b9ba66f33f2d1063f3932d3468 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:16:12 +0000
Subject: [PATCH 207/323] update

---
 ...p_with_while_loop_simple_add_dispatch_in_torch.py | 12 ++++++++++--
 torch_xla/experimental/fori_loop.py                  |  2 +-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index aeef18c9841..c5c2765dbcd 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -154,8 +154,16 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     # bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
     # bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value) # , weight_0, bias_0)
     # bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value)
-    bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
-    print("bbb: ", bbb)
+    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
+    # print("bbb: ", bbb)
+    print("upper__: ", upper__)
+    print("lower__: ", lower__)
+    print("one_value__: ", one_value__)
+    print("torch_add_res__: ", torch_add_res__)
+    print("input_value__: ", input_value__)
+    print("output_value_real__: ", output_value_real__)
+    print("weight__: ", weight__)
+    print("bias__: ", bias__)
     # print("start test 6 !!!")
     return aaa
 
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 4bdf3d6c70b..6ad00f67c7a 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -170,7 +170,7 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   computation = w.build(name)
 
   # print("carried_inputs: ", carried_inputs)
-  print("total_inputs: ", total_inputs)
+  # print("total_inputs: ", total_inputs)
 
   # gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',

From 191b6664bdff72c9a27ba0197f0b2a7019c1b533 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:16:40 +0000
Subject: [PATCH 208/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index c5c2765dbcd..844175ac050 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -162,8 +162,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     print("torch_add_res__: ", torch_add_res__)
     print("input_value__: ", input_value__)
     print("output_value_real__: ", output_value_real__)
-    print("weight__: ", weight__)
-    print("bias__: ", bias__)
+    # print("weight__: ", weight__)
+    # print("bias__: ", bias__)
     # print("start test 6 !!!")
     return aaa
 

From d6935186149db6ca216a77fdfd391c95844280a0 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:20:05 +0000
Subject: [PATCH 209/323] update

---
 torch_xla/experimental/fori_loop.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 6ad00f67c7a..fb7b37b21a8 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -113,6 +113,9 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   cond_hlo = cond_ctx.hlo()
   cond_computation = xb.computation_from_module_proto("condcomputation",
                                                       cond_hlo)
+  cond_hlo_print = xb.get_computation_hlo(cond_computation)
+  print("cond computation: !!!!!!!!!")
+  print(cond_hlo_print)
   # print("!!! arrive here too after cond !!!")
 
   # print("!!! arrive here too before body !!!")
@@ -132,6 +135,9 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",
                                                       body_hlo)
+  body_hlo_print = xb.get_computation_hlo(body_computation)
+  print("body computation: !!!!!!!!!")
+  print(body_hlo_print)
   # print("!!! arrive here too after body !!!")
 
   # print("!!! arrive here too before args!!!")
@@ -168,6 +174,9 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
       body_computation=body_computation)
   name = 'fori_loop_ed_torch_func'
   computation = w.build(name)
+  hlo_print = xb.get_computation_hlo(computation)
+  print("while computation: !!!!!!!!!")
+  print(hlo_print)
 
   # print("carried_inputs: ", carried_inputs)
   # print("total_inputs: ", total_inputs)

From f8cc89ec788aaab74bc1b652e0f3359e083d9d76 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:29:37 +0000
Subject: [PATCH 210/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 844175ac050..e13449266ea 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -122,7 +122,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
                 # new_one_value = one_value
                 # new_input_value = input_value
                 # return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value_real
-                return upper.clone(), lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real, weight.clone(), bias.clone()
+                return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real, weight.clone(), bias.clone()
             # return while_loop(cond_fn, body_fn, (iter, x))
             # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
             # return 1

From 089a27f9b09bc34aefc1546c3db511ab4088b685 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:30:16 +0000
Subject: [PATCH 211/323] update

---
 torch_xla/experimental/fori_loop.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index fb7b37b21a8..14ceea7af04 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -113,9 +113,9 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   cond_hlo = cond_ctx.hlo()
   cond_computation = xb.computation_from_module_proto("condcomputation",
                                                       cond_hlo)
-  cond_hlo_print = xb.get_computation_hlo(cond_computation)
-  print("cond computation: !!!!!!!!!")
-  print(cond_hlo_print)
+  # cond_hlo_print = xb.get_computation_hlo(cond_computation)
+  # print("cond computation: !!!!!!!!!")
+  # print(cond_hlo_print)
   # print("!!! arrive here too after cond !!!")
 
   # print("!!! arrive here too before body !!!")
@@ -135,9 +135,9 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",
                                                       body_hlo)
-  body_hlo_print = xb.get_computation_hlo(body_computation)
-  print("body computation: !!!!!!!!!")
-  print(body_hlo_print)
+  # body_hlo_print = xb.get_computation_hlo(body_computation)
+  # print("body computation: !!!!!!!!!")
+  # print(body_hlo_print)
   # print("!!! arrive here too after body !!!")
 
   # print("!!! arrive here too before args!!!")
@@ -174,9 +174,9 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
       body_computation=body_computation)
   name = 'fori_loop_ed_torch_func'
   computation = w.build(name)
-  hlo_print = xb.get_computation_hlo(computation)
-  print("while computation: !!!!!!!!!")
-  print(hlo_print)
+  # hlo_print = xb.get_computation_hlo(computation)
+  # print("while computation: !!!!!!!!!")
+  # print(hlo_print)
 
   # print("carried_inputs: ", carried_inputs)
   # print("total_inputs: ", total_inputs)

From 3a1fcf1a581708ea4e7345f73e08ee6b4bbb4de8 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:30:59 +0000
Subject: [PATCH 212/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index e13449266ea..4d85222ef01 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -109,7 +109,7 @@ def forward(self, upper, lower, one_value, x, input_value, output_value):
             # def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             # def cond_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
             def cond_fn(upper, lower, one_value, x, input_value, output_value):
-                return lower[0] >= upper[0]
+                return lower[0] <= upper[0]
 
             # def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             # def body_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):

From ea49168ee29839b2146c8d755c8187464070a30f Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:31:56 +0000
Subject: [PATCH 213/323] update

---
 ...est_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 4d85222ef01..cb33020965c 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -168,6 +168,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     return aaa
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
+    print("expected: ", expected)
 
     self.assertTrue(torch.all(torch.eq(expected, l_out_)))
     # res = simple_with_linear.apply((upper, lower, one_value, init_val, l_in_0, output_value))

From ab065d5f8e84daf45ab7fcbe54e1889341472474 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:32:39 +0000
Subject: [PATCH 214/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index cb33020965c..c31b8bf0e78 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -165,12 +165,12 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     # print("weight__: ", weight__)
     # print("bias__: ", bias__)
     # print("start test 6 !!!")
-    return aaa
+    # return aaa
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
     print("expected: ", expected)
 
-    self.assertTrue(torch.all(torch.eq(expected, l_out_)))
+    return self.assertTrue(torch.all(torch.eq(expected, l_out_)))
     # res = simple_with_linear.apply((upper, lower, one_value, init_val, l_in_0, output_value))
     # print("res: ", res)
     # import pdb; pdb.set_trace()

From 060eaace1bf7950d71e4b162c318f23170a210fb Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:33:04 +0000
Subject: [PATCH 215/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index c31b8bf0e78..9beffd3fe94 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -170,7 +170,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
     print("expected: ", expected)
 
-    return self.assertTrue(torch.all(torch.eq(expected, l_out_)))
+    self.assertTrue(torch.all(torch.eq(expected, l_out_)))
+    return aaa
     # res = simple_with_linear.apply((upper, lower, one_value, init_val, l_in_0, output_value))
     # print("res: ", res)
     # import pdb; pdb.set_trace()

From b649e7e8fb93847409dd105927f02347c3adb319 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:35:20 +0000
Subject: [PATCH 216/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 9beffd3fe94..b82d0f3b994 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -167,6 +167,9 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     # print("start test 6 !!!")
     # return aaa
 
+    linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
+    linear_0.weight_.data = weight__
+    linear_0.bias_.data = bias__
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
     print("expected: ", expected)
 

From 1d901a3dd66e1e0cbb2886f177178358682fb078 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:35:53 +0000
Subject: [PATCH 217/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index b82d0f3b994..144fc024549 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -168,8 +168,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     # return aaa
 
     linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
-    linear_0.weight_.data = weight__
-    linear_0.bias_.data = bias__
+    linear_0.weight.data = weight__
+    linear_0.bias.data = bias__
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
     print("expected: ", expected)
 

From b62ba4621d768bd565e0ca729fb431c042d21dc0 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:36:31 +0000
Subject: [PATCH 218/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 144fc024549..34906cc363d 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -173,7 +173,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
     print("expected: ", expected)
 
-    self.assertTrue(torch.all(torch.eq(expected, l_out_)))
+    self.assertTrue(torch.all(torch.eq(expected, output_value_real__)))
     return aaa
     # res = simple_with_linear.apply((upper, lower, one_value, init_val, l_in_0, output_value))
     # print("res: ", res)

From b0f18f6285075e8e31199bf58da4b564f0b7af1a Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:37:49 +0000
Subject: [PATCH 219/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 34906cc363d..401e17f2ca4 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -136,7 +136,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
             # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()
-    upper = torch.tensor([52], dtype=torch.int32, device=device)
+    upper = torch.tensor([2], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)
     one_value = torch.tensor([1], dtype=torch.int32, device=device)
     init_val = torch.tensor([1], dtype=torch.int32, device=device) # x
@@ -172,6 +172,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     linear_0.bias.data = bias__
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
     print("expected: ", expected)
+    print("l_in_0: ", l_in_0)
 
     self.assertTrue(torch.all(torch.eq(expected, output_value_real__)))
     return aaa

From a37f72beff8271fe8f02ee799644b9ad8f32c475 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:38:49 +0000
Subject: [PATCH 220/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 401e17f2ca4..c79631e94a7 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -109,7 +109,7 @@ def forward(self, upper, lower, one_value, x, input_value, output_value):
             # def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             # def cond_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
             def cond_fn(upper, lower, one_value, x, input_value, output_value):
-                return lower[0] <= upper[0]
+                return lower[0] < upper[0]
 
             # def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
             # def body_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):

From 36fa72effccd2a560909f8579c729a4104350062 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:41:41 +0000
Subject: [PATCH 221/323] update

---
 ...while_loop_simple_add_dispatch_in_torch.py | 46 ++-----------------
 1 file changed, 5 insertions(+), 41 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index c79631e94a7..e569ebf62dc 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -98,83 +98,47 @@ class SimpleWithLinear(torch.nn.Module):
         def __init__(self):
             super().__init__()
             self.linear = torch.nn.Linear(10, 20).to(xm.xla_device())
-            # self.register_buffer("dec", torch.tensor(1))
 
-        # def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
         def forward(self, upper, lower, one_value, x, input_value, output_value):
-        # def forward(self, upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
-            # weight_1 = self.linear.weight
-            # bias_1 = self.linear.bias
-
-            # def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
-            # def cond_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
             def cond_fn(upper, lower, one_value, x, input_value, output_value):
                 return lower[0] < upper[0]
 
-            # def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
-            # def body_fn(upper, lower, one_value, x, input_value, weight_1, bias_1, output_value):
             def body_fn(upper, lower, one_value, x, input_value, output_value):
                 new_lower = torch.add(one_value, lower)
                 output_value_real = self.linear(input_value)
-                weight = self.linear.weight # not be used actually, would be used as 
-                bias = self.linear.bias
-                # new_upper = upper
-                # new_one_value = one_value
-                # new_input_value = input_value
-                # return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value_real
+                weight = self.linear.weight # not be used actually, initialized as placeholder xlacomputation requirement 
+                bias = self.linear.bias # not be used actually, initialized as placeholder xlacomputation requirement 
                 return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real, weight.clone(), bias.clone()
-            # return while_loop(cond_fn, body_fn, (iter, x))
-            # return while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
-            # return 1
-            # return upper, lower, one_value, x, input_value, output_value
-            # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_0, bias_0, output_value))
-            # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, weight_1, bias_1, output_value))
-            # weight_1 = self.linear.weight
-            # bias_1 = self.linear.bias
-            # return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value), (bias_1, weight_1))
             return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
-            # return _xla_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     simple_with_linear = SimpleWithLinear()
     upper = torch.tensor([2], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)
     one_value = torch.tensor([1], dtype=torch.int32, device=device)
-    init_val = torch.tensor([1], dtype=torch.int32, device=device) # x
-    # l_in_0 = torch.randn(10, device=xm.xla_device()) # input_value
+    init_val = torch.tensor([1], dtype=torch.int32, device=device)
     l_in_0 = torch.ones(10, device=xm.xla_device()) # input_value
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
-    # linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
     weight_0 = simple_with_linear.linear.weight
     bias_0 = simple_with_linear.linear.bias
 
     aaa = {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
-    # upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = aaa
-    # print("aaa: ", aaa)
-    # bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
-    # bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value) # , weight_0, bias_0)
-    # bbb = simple_with_linear(upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value)
     upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
-    # print("bbb: ", bbb)
     print("upper__: ", upper__)
     print("lower__: ", lower__)
     print("one_value__: ", one_value__)
     print("torch_add_res__: ", torch_add_res__)
     print("input_value__: ", input_value__)
     print("output_value_real__: ", output_value_real__)
-    # print("weight__: ", weight__)
-    # print("bias__: ", bias__)
-    # print("start test 6 !!!")
-    # return aaa
 
     linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
     linear_0.weight.data = weight__
     linear_0.bias.data = bias__
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
     print("expected: ", expected)
-    print("l_in_0: ", l_in_0)
+    # print("l_in_0: ", l_in_0)
 
-    self.assertTrue(torch.all(torch.eq(expected, output_value_real__)))
+    self.assertTrue(torch.all(torch.eq(expected, l_in_0)))
     return aaa
     # res = simple_with_linear.apply((upper, lower, one_value, init_val, l_in_0, output_value))
     # print("res: ", res)

From 6312c498b26ff41f25ef5c4b0b3cf15e24742be5 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:42:31 +0000
Subject: [PATCH 222/323] update

---
 ...ri_loop_with_while_loop_simple_add_dispatch_in_torch.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index e569ebf62dc..9a88267c030 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -136,14 +136,9 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     linear_0.bias.data = bias__
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
     print("expected: ", expected)
-    # print("l_in_0: ", l_in_0)
 
-    self.assertTrue(torch.all(torch.eq(expected, l_in_0)))
+    self.assertTrue(torch.all(torch.eq(expected, output_value_real__)))
     return aaa
-    # res = simple_with_linear.apply((upper, lower, one_value, init_val, l_in_0, output_value))
-    # print("res: ", res)
-    # import pdb; pdb.set_trace()
-    # return {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
 
   def test_fori_loop_tpu_addition(self):
 

From 4be4c237e526719cfe7f1d7d3076f20bb4a45273 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:48:22 +0000
Subject: [PATCH 223/323] update

---
 ...while_loop_simple_add_dispatch_in_torch.py | 43 --------
 torch_xla/experimental/fori_loop.py           | 99 +++----------------
 2 files changed, 16 insertions(+), 126 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 9a88267c030..a3e684f3069 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -7,7 +7,6 @@
 # We need to import the underlying implementation function to register with the dispatcher
 import torch_xla.experimental.fori_loop
 from torch_xla.experimental.fori_loop import fori_loop
-# from torch_xla.experimental.fori_loop import _xla_while_loop
 from torch._higher_order_ops.while_loop import while_loop
 import torch_xla.core.xla_model as xm
 import torch_xla.core.xla_builder as xb
@@ -24,7 +23,6 @@ def _fake_fori_loop(lower, upper, body_fun, *init_val):
   if len(init_val) > 1:
     (a, b) = init_val
     for i in range((upper - lower)[0]):
-      # a = body_fun(a, b)
       a = body_fun(*init_val)
   else:
     for i in range((upper - lower)[0]):
@@ -180,44 +178,3 @@ def test_fori_loop_tpu_simple_linear(self):
 if __name__ == '__main__':
   test = unittest.main()
   sys.exit(0 if test.result.wasSuccessful() else 1)
-
-
-######## ---------------------------------------------------------
-
-#     x = torch.zeros(1)
-#     y = torch.zeros(1)
-#     z = torch.zeros(1)
-#     return {"simple_with_linear": (simple_with_linear, (torch.tensor(3), torch.randn(2, 2)))}
-
-    # xm.mark_step()
-    # device = xm.xla_device()
-    # torch.set_grad_enabled(False)
-
-    # upper = torch.tensor([52], dtype=torch.int32, device=device)
-    # lower = torch.tensor([0], dtype=torch.int32, device=device)
-    # one_value = torch.tensor([1], dtype=torch.int32, device=device)
-    # init_val = torch.tensor([1], dtype=torch.int32, device=device) # x
-    # l_in_0 = torch.randn(10, device=xm.xla_device()) # input_value
-    # output_value = torch.zeros([20], dtype=torch.float32, device=device)
-
-    # linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
-    # weight_0 = linear_0.weight
-    # bias_0 = linear_0.bias
-
-    # # def cond_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
-    # def cond_fn(upper, lower, one_value, x, input_value, output_value):
-    #   return lower[0] < upper[0]
-
-    # def body_fn(upper, lower, one_value, x, input_value, output_value):
-    #   new_lower = torch.add(one_value, lower)
-    #   output_value = linear_0(input_value)
-    #   weight = linear_0.weight
-    #   bias = linear_0.bias
-    #   return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
-
-    # # print("!!! arrive here !!!")
-    # upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
-
-    # expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
-
-    # self.assertTrue(torch.all(torch.eq(expected, l_out_)))
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 14ceea7af04..baa2dcebed7 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -17,18 +17,14 @@ def fori_loop(upper, lower, body_fun, init_val, *input_value):
 
   device = xm.xla_device()
 
-  def cond_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0): # , bias_0): # , output_value):
+  def cond_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0):
     return lower[0] < upper[0]
 
-  def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0): # , bias_0):
-    # weight = body_fun.weight
-    new_lower = torch.add(one_value, lower) ### !!! this matter, torch.add might would change the second argument's value, even we use a new variable to catch the result!!!
-    output_value = body_fun(*input_value) ### !!! due to the output_value is not actually used here, 
-    # --- !!! its original value would not be used, and it would be replaces by the result of body_fun
-    # --- !!! so, due to PTXLA is traced from result tensor, so the arguments `output_value` would not be included in the body_xlacomputation
-    # --- !!! so, we need to modify ini_python_binding.cpp to add a fake arguments in the xlacompputation
-    weight = body_fun.weight
-    bias = body_fun.bias
+  def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0):
+    new_lower = torch.add(one_value, lower)
+    output_value = body_fun(*input_value)
+    weight = body_fun.weight  # not be used actually, initialized as placeholder xlacomputation requirement
+    bias = body_fun.bias  # not be used actually, initialized as placeholder xlacomputation requirement
     return upper, new_lower, one_value, torch.add(one_value, x), *input_value, weight, bias, output_value
 
   output_value = torch.zeros([20], dtype=torch.float32, device=device)
@@ -40,34 +36,16 @@ def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bi
 
 
 @while_loop_op.py_impl(DispatchKey.XLA)
-def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs):
+def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   # TODO(@manfei): PyTorch require carried_inputs to be list/tuple, PyTorch/XLA _xla_while_loop only accept *operands, *operands would tuple items again: (a, '')
   # cond_fn&body_fn: callable
   # carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)
-  # print("!!! arrive here @while_loop_op.py_impl(DispatchKey.XLA) !!!")
-  # print("while_loop additional_inputs: ", additional_inputs)
   if additional_inputs is None:
     additional_inputs = tuple()
-  return _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=additional_inputs) #  a=a, b=b, c=c,
-
-
-def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
-  # print("!!! arrive here def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()): !!!")
-  # print("original_carried_inputs: ", original_carried_inputs)
-  # print("additional_inputs: ", additional_inputs)
-  # import pdb; pdb.set_trace()
-  # untuple carried_inputs from while_loop
-  # carried_inputs = original_carried_inputs[0] ### due to PyTorch has already treat them , so skip split here
-  # TODO(@manfei): please clear pass additional_inputs in `while_loop`'s defination in this file
-  ### due to PyTorch has already treat them , so skip split here
-  # if len(original_carried_inputs) == 2:
-  #   print("use original_carried_inputs for additional_inputs")
-  #   additional_inputs = original_carried_inputs[1]
-
-  # # exchange order of bias and weight in additional_inputs
-  # (bias_p, weight_p) = additional_inputs
-  # additional_inputs = (weight_p, bias_p)
+  return _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs)
 
+
+def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   # fake carried_inputs to split formal code
   fake_carried_inputs = []
   for carried_input in carried_inputs:
@@ -76,74 +54,41 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
     fake_carried_inputs.append(
         torch.randint(10, carried_input.size(),
                       dtype=carried_input.dtype).to(device))
-  # fake_carried_inputs = tuple(fake_carried_inputs)
-  # print("fake_carried_inputs first: ", fake_carried_inputs)
   for additional_input in additional_inputs:
     device = additional_input.device
     #TODO(@manfei) type = carried_input.type
     fake_carried_inputs.append(
         torch.randint(10, additional_input.size(),
                       dtype=additional_input.dtype).to(device))
-  # fake_carried_inputs = tuple(fake_carried_inputs)
-  # print("fake_carried_inputs second: ", fake_carried_inputs)
 
-  # print("!!! arrive here too before cond !!!")
-  # generate cond_fn xlacomputation
-  # print("print fake_carried_inputs: ", fake_carried_inputs)
   # TODO(@manfei): specify which element is for which argument like a,b,c
-  cond_result = cond_fn(*fake_carried_inputs) # [:-3], weight_0=fake_carried_inputs[-2], output_value=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-1])
-  # print("nnn here ???")
+  cond_result = cond_fn(*fake_carried_inputs)
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
   additional_inputs_list_cond = list(fake_carried_inputs[2:]) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
 
   tmp_bias = additional_inputs_list_cond[-2] # not used, change order doesn't affect logic
-  # tmp_output_value = additional_inputs_list_cond[-3] # not used, change order doesn't affect logic
-  # del additional_inputs_list_cond[-3] # not used, change order doesn't affect logic
   del additional_inputs_list_cond[-2] # not used, change order doesn't affect logic
   additional_inputs_list_cond.append(tmp_bias) # not used, change order doesn't affect logic
-  # additional_inputs_list_cond.append(tmp_output_value) # not used, change order doesn't affect logic
 
-  # treat and pass additional_inputs to cond_fn
-  # print("additional_inputs_list_cond one: ", additional_inputs_list_cond)
-  # for i in range(len(additional_inputs)):
-  #   additional_inputs_list_cond.append(additional_inputs[i])
-  # print("additional_inputs_list_cond two: ", additional_inputs_list_cond)
   cond_ctx.buildforiloop([cond_result], additional_inputs_list_cond)
   cond_hlo = cond_ctx.hlo()
   cond_computation = xb.computation_from_module_proto("condcomputation",
                                                       cond_hlo)
-  # cond_hlo_print = xb.get_computation_hlo(cond_computation)
-  # print("cond computation: !!!!!!!!!")
-  # print(cond_hlo_print)
-  # print("!!! arrive here too after cond !!!")
 
-  # print("!!! arrive here too before body !!!")
   # generate body_fn xlacomputation
-  body_result = body_fn(*fake_carried_inputs) # [:-3], weight_0=fake_carried_inputs[-1], output_value=fake_carried_inputs[-3], bias_0=fake_carried_inputs[-2])
+  body_result = body_fn(*fake_carried_inputs)
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
   body_ctx.set_name_string("bodyctx")
-  additional_inputs_list_body = [fake_carried_inputs[-3]] # missed arguments due to given output_value was not used and PyTorch/XLA trace xlacomputation from output tensor
-  # TODO(@manfei): treat and pass additional_inputs to body_fn too
-  # print("list(fake_carried_inputs[-2]: ", fake_carried_inputs[-2])
-  # print("len0!!!: ", len(additional_inputs_list_body))
-  # for i in range(len(additional_inputs)):
-  #   additional_inputs_list_body.append(additional_inputs[i])
-  # print("len!!!: ", len(additional_inputs_list_body))
-  # print("additional_inputs_list_body: ", additional_inputs_list_body)
+  additional_inputs_list_body = [fake_carried_inputs[-3]]
+  # TODO(@manfei): treat hard-code parameters: additional_inputs_list_body
   body_ctx.buildforiloop(list(body_result), additional_inputs_list_body)
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",
                                                       body_hlo)
-  # body_hlo_print = xb.get_computation_hlo(body_computation)
-  # print("body computation: !!!!!!!!!")
-  # print(body_hlo_print)
-  # print("!!! arrive here too after body !!!")
 
-  # print("!!! arrive here too before args!!!")
-  total_inputs = carried_inputs + additional_inputs
-  # print("total_inputs: ", total_inputs)
   # trans fake_carried_inputs from list(tensor) to list(xla::op), which part could change init of xla::while
+  total_inputs = carried_inputs + additional_inputs
   kwargs = {}
   if type(total_inputs) is tuple:
     shapes = xb.tensor_shape(total_inputs)
@@ -155,17 +100,11 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
     p = xb.mkparam(builder, len(params), shape)
     params.append(p)
 
+  # TODO(@manfei): treat hard-code input arguments
   tmp_bias = params[-2]
-  # tmp_output_value = params[-3]
-  # del params[-3]
   del params[-2]
   params.append(tmp_bias)
-  # params.append(tmp_output_value)
 
-  # print("args params: ", params)
-  # print("!!! arrive here too after args!!!")
-
-  # print("!!! arrive here too before while!!!")
   # generate while xlacomputation
   input_tuple = xb.Op.tuple(tuple(params))
   w = xb.mkop(
@@ -174,12 +113,6 @@ def _xla_while_loop(cond_fn, body_fn, *carried_inputs, additional_inputs=()):
       body_computation=body_computation)
   name = 'fori_loop_ed_torch_func'
   computation = w.build(name)
-  # hlo_print = xb.get_computation_hlo(computation)
-  # print("while computation: !!!!!!!!!")
-  # print(hlo_print)
-
-  # print("carried_inputs: ", carried_inputs)
-  # print("total_inputs: ", total_inputs)
 
   # gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',

From e3a2fcd8a1daca469296706153c01f23112a8132 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:51:14 +0000
Subject: [PATCH 224/323] update

---
 ...ri_loop_with_while_loop_simple_add_dispatch_in_torch.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index a3e684f3069..111b4203b8b 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -122,13 +122,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
 
     aaa = {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
     upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
-    print("upper__: ", upper__)
-    print("lower__: ", lower__)
-    print("one_value__: ", one_value__)
-    print("torch_add_res__: ", torch_add_res__)
-    print("input_value__: ", input_value__)
-    print("output_value_real__: ", output_value_real__)
 
+    # same weight/bias liear model
     linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
     linear_0.weight.data = weight__
     linear_0.bias.data = bias__

From abb30456f68e4929070317be8c429572d7d471e3 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:52:15 +0000
Subject: [PATCH 225/323] update

---
 ...est_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 111b4203b8b..d06c8e8d56c 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -128,7 +128,6 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     linear_0.weight.data = weight__
     linear_0.bias.data = bias__
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
-    print("expected: ", expected)
 
     self.assertTrue(torch.all(torch.eq(expected, output_value_real__)))
     return aaa

From 95608ff09b4c54d31a19d52b3e4ba7c95930ca72 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:53:58 +0000
Subject: [PATCH 226/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index d06c8e8d56c..88243eee9b4 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -171,4 +171,4 @@ def test_fori_loop_tpu_simple_linear(self):
 
 if __name__ == '__main__':
   test = unittest.main()
-  sys.exit(0 if test.result.wasSuccessful() else 1)
+  sys.exit(0 if test.result.wasSuccessful() else 1)
\ No newline at end of file

From c8d722ee54bb6c74ba5170eecbd2adb14ec4b266 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Mon, 15 Apr 2024 23:59:20 +0000
Subject: [PATCH 227/323] update

---
 torch_xla/csrc/init_python_bindings.cpp | 2 ++
 torch_xla/experimental/fori_loop.py     | 9 +++++++++
 2 files changed, 11 insertions(+)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 145739e1d1a..c7013569dd1 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -917,6 +917,7 @@ class PyLoweringContext {
     if (GetNameString() == "condctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
       // hard-code parameter_idx to 2 to skip existing upper/lower arguments
+      // TODO(@manfei): get body xlacomputation arguments' number first then decide items in `additional_inputs_list`, maybe implement in python level
       // !!! since cond_fn only compare upper and lower, so it would only use two arguments, due to PyTorch/XLA
       // !!! trace xlacomputation from result tensor, so all the other arguments would not be included or generated;
       // !!! but to meet xla::while requirement, we would skip first two arguments,
@@ -935,6 +936,7 @@ class PyLoweringContext {
     }
 
     // hard-code modify body xlacomputation input arguments
+    // TODO(@manfei): get body xlacomputation arguments' number first then decide items in `additional_inputs_list`, maybe implement in python level
     if (GetNameString() == "bodyctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
       int64_t parameter_idx = 7; // tensors.size();
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index baa2dcebed7..a0f8024333d 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -75,6 +75,9 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   cond_hlo = cond_ctx.hlo()
   cond_computation = xb.computation_from_module_proto("condcomputation",
                                                       cond_hlo)
+  cond_hlo_print = xb.get_computation_hlo(cond_computation)
+  print("cond computation: !!!!!!!!!")
+  print(cond_hlo_print)
 
   # generate body_fn xlacomputation
   body_result = body_fn(*fake_carried_inputs)
@@ -86,6 +89,9 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",
                                                       body_hlo)
+  body_hlo_print = xb.get_computation_hlo(body_computation)
+  print("body computation: !!!!!!!!!")
+  print(body_hlo_print)
 
   # trans fake_carried_inputs from list(tensor) to list(xla::op), which part could change init of xla::while
   total_inputs = carried_inputs + additional_inputs
@@ -113,6 +119,9 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
       body_computation=body_computation)
   name = 'fori_loop_ed_torch_func'
   computation = w.build(name)
+  hlo_print = xb.get_computation_hlo(computation)
+  print("while computation: !!!!!!!!!")
+  print(hlo_print)
 
   # gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',

From d33f14c82ffa25cdfff6ea22add19498a973f126 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 17:06:12 +0000
Subject: [PATCH 228/323] update

---
 ...while_loop_simple_add_dispatch_in_torch.py | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 88243eee9b4..c7e5b9392e7 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -86,12 +86,52 @@ def body_fn(init, limit_value):
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
 
+# passed
   def test_while_loop_tpu_simple_linear(self):
 
     xm.mark_step()
     device = xm.xla_device()
     torch.set_grad_enabled(False)
 
+    # def forward(self, upper, lower, one_value, x, input_value, output_value):
+    linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
+
+    def cond_fn(upper, lower, one_value, x, input_value, output_value):
+      return lower[0] < upper[0]
+
+    def body_fn(upper, lower, one_value, x, input_value, output_value):
+      new_lower = torch.add(one_value, lower)
+      output_value_real = linear_0(input_value)
+      weight = linear_0.weight # not be used actually, initialized as placeholder xlacomputation requirement 
+      bias = linear_0.bias # not be used actually, initialized as placeholder xlacomputation requirement 
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real, weight.clone(), bias.clone()
+
+    simple_with_linear = SimpleWithLinear()
+    upper = torch.tensor([2], dtype=torch.int32, device=device)
+    lower = torch.tensor([0], dtype=torch.int32, device=device)
+    one_value = torch.tensor([1], dtype=torch.int32, device=device)
+    init_val = torch.tensor([1], dtype=torch.int32, device=device)
+    l_in_0 = torch.ones(10, device=xm.xla_device()) # input_value
+    output_value = torch.zeros([20], dtype=torch.float32, device=device)
+
+    # weight_0 = simple_with_linear.linear.weight
+    # bias_0 = simple_with_linear.linear.bias
+
+    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = 
+      while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
+
+    expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
+
+    self.assertTrue(torch.all(torch.eq(expected, output_value_real__)))
+    return aaa
+
+# passed
+  def test_while_loop_tpu_simple_linear_class(self):
+
+    xm.mark_step()
+    device = xm.xla_device()
+    torch.set_grad_enabled(False)
+
     class SimpleWithLinear(torch.nn.Module):
         def __init__(self):
             super().__init__()

From 1f8002a07893545be707b11f4cef5091f721ed59 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 17:07:05 +0000
Subject: [PATCH 229/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index c7e5b9392e7..e33cd21e476 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -117,8 +117,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     # weight_0 = simple_with_linear.linear.weight
     # bias_0 = simple_with_linear.linear.bias
 
-    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = 
-      while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
+    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 

From 5cffda24f659ed8cdd1b4abd3092b7a03ff82494 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 17:07:41 +0000
Subject: [PATCH 230/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index e33cd21e476..304ce22b96e 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -106,7 +106,6 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       bias = linear_0.bias # not be used actually, initialized as placeholder xlacomputation requirement 
       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real, weight.clone(), bias.clone()
 
-    simple_with_linear = SimpleWithLinear()
     upper = torch.tensor([2], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)
     one_value = torch.tensor([1], dtype=torch.int32, device=device)
@@ -114,9 +113,6 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     l_in_0 = torch.ones(10, device=xm.xla_device()) # input_value
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
-    # weight_0 = simple_with_linear.linear.weight
-    # bias_0 = simple_with_linear.linear.bias
-
     upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)

From f9cd4dc55bdf180c85d68b20bff8a64cbb412a84 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 17:08:49 +0000
Subject: [PATCH 231/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 304ce22b96e..db73f40a87c 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -113,7 +113,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     l_in_0 = torch.ones(10, device=xm.xla_device()) # input_value
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
-    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
+    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 

From 1fc141409d24c83b5e7c80dcd64bb0484aeae7de Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 17:09:17 +0000
Subject: [PATCH 232/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index db73f40a87c..436e5a68b55 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -113,7 +113,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     l_in_0 = torch.ones(10, device=xm.xla_device()) # input_value
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
-    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
+    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 

From 90481612e5e838118ce7084834507b57f1d42250 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 17:10:07 +0000
Subject: [PATCH 233/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 436e5a68b55..411d2d9b41e 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -117,6 +117,9 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 
+    print("output_value_real__: ", output_value_real__)
+    print("expected: ", expected)
+
     self.assertTrue(torch.all(torch.eq(expected, output_value_real__)))
     return aaa
 

From 614a57fe461cd79be05a6d5e292aa19963d0b3b8 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 17:10:46 +0000
Subject: [PATCH 234/323] update

---
 torch_xla/experimental/fori_loop.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index a0f8024333d..b8b0583e10e 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -75,9 +75,9 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   cond_hlo = cond_ctx.hlo()
   cond_computation = xb.computation_from_module_proto("condcomputation",
                                                       cond_hlo)
-  cond_hlo_print = xb.get_computation_hlo(cond_computation)
-  print("cond computation: !!!!!!!!!")
-  print(cond_hlo_print)
+  # cond_hlo_print = xb.get_computation_hlo(cond_computation)
+  # print("cond computation: !!!!!!!!!")
+  # print(cond_hlo_print)
 
   # generate body_fn xlacomputation
   body_result = body_fn(*fake_carried_inputs)
@@ -89,9 +89,9 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",
                                                       body_hlo)
-  body_hlo_print = xb.get_computation_hlo(body_computation)
-  print("body computation: !!!!!!!!!")
-  print(body_hlo_print)
+  # body_hlo_print = xb.get_computation_hlo(body_computation)
+  # print("body computation: !!!!!!!!!")
+  # print(body_hlo_print)
 
   # trans fake_carried_inputs from list(tensor) to list(xla::op), which part could change init of xla::while
   total_inputs = carried_inputs + additional_inputs
@@ -119,9 +119,9 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
       body_computation=body_computation)
   name = 'fori_loop_ed_torch_func'
   computation = w.build(name)
-  hlo_print = xb.get_computation_hlo(computation)
-  print("while computation: !!!!!!!!!")
-  print(hlo_print)
+  # hlo_print = xb.get_computation_hlo(computation)
+  # print("while computation: !!!!!!!!!")
+  # print(hlo_print)
 
   # gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',

From eded2acc639cd03d600b2e60ac25d21b68c24ce3 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 17:12:27 +0000
Subject: [PATCH 235/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 411d2d9b41e..738d592035a 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -148,7 +148,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
             return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
 
     simple_with_linear = SimpleWithLinear()
-    upper = torch.tensor([2], dtype=torch.int32, device=device)
+    upper = torch.tensor([52], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)
     one_value = torch.tensor([1], dtype=torch.int32, device=device)
     init_val = torch.tensor([1], dtype=torch.int32, device=device)

From 504ea2ba74e2a599a936d161d371b6ddc1cd1da3 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 17:13:21 +0000
Subject: [PATCH 236/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 738d592035a..aef680f2edf 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -86,7 +86,6 @@ def body_fn(init, limit_value):
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
 
-# passed
   def test_while_loop_tpu_simple_linear(self):
 
     xm.mark_step()
@@ -106,7 +105,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       bias = linear_0.bias # not be used actually, initialized as placeholder xlacomputation requirement 
       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real, weight.clone(), bias.clone()
 
-    upper = torch.tensor([2], dtype=torch.int32, device=device)
+    upper = torch.tensor([1], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)
     one_value = torch.tensor([1], dtype=torch.int32, device=device)
     init_val = torch.tensor([1], dtype=torch.int32, device=device)

From b5889a85f4838fd504afb974e87b16b9e10e9b06 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 17:14:47 +0000
Subject: [PATCH 237/323] update

---
 ...est_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index aef680f2edf..f75a87b8cbc 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -116,6 +116,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 
+    print("torch_add_res__: ", torch_add_res__)
     print("output_value_real__: ", output_value_real__)
     print("expected: ", expected)
 

From a572c1b6662fd0d59c6aa3e2c7f2cab32a233231 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 17:15:24 +0000
Subject: [PATCH 238/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index f75a87b8cbc..287797bab62 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -105,7 +105,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       bias = linear_0.bias # not be used actually, initialized as placeholder xlacomputation requirement 
       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real, weight.clone(), bias.clone()
 
-    upper = torch.tensor([1], dtype=torch.int32, device=device)
+    upper = torch.tensor([52], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)
     one_value = torch.tensor([1], dtype=torch.int32, device=device)
     init_val = torch.tensor([1], dtype=torch.int32, device=device)

From aeda7f78f91f70dc235fd4a6dc824c8c1f687669 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 17:16:07 +0000
Subject: [PATCH 239/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 287797bab62..ef01d7213d0 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -103,7 +103,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       output_value_real = linear_0(input_value)
       weight = linear_0.weight # not be used actually, initialized as placeholder xlacomputation requirement 
       bias = linear_0.bias # not be used actually, initialized as placeholder xlacomputation requirement 
-      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real, weight.clone(), bias.clone()
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), weight.clone(), bias.clone(), output_value_real
 
     upper = torch.tensor([52], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)

From c6344a9aa60e763849323fd1910afd7bbd6e4fa0 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 17:17:50 +0000
Subject: [PATCH 240/323] update

---
 ...ori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 torch_xla/experimental/fori_loop.py                         | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index ef01d7213d0..287797bab62 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -103,7 +103,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       output_value_real = linear_0(input_value)
       weight = linear_0.weight # not be used actually, initialized as placeholder xlacomputation requirement 
       bias = linear_0.bias # not be used actually, initialized as placeholder xlacomputation requirement 
-      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), weight.clone(), bias.clone(), output_value_real
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real, weight.clone(), bias.clone()
 
     upper = torch.tensor([52], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index b8b0583e10e..0160fca2d5a 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -119,9 +119,9 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
       body_computation=body_computation)
   name = 'fori_loop_ed_torch_func'
   computation = w.build(name)
-  # hlo_print = xb.get_computation_hlo(computation)
-  # print("while computation: !!!!!!!!!")
-  # print(hlo_print)
+  hlo_print = xb.get_computation_hlo(computation)
+  print("while computation: !!!!!!!!!")
+  print(hlo_print)
 
   # gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',

From 2e7f5fb0beed5539ab2aeab7ba26f439716cc327 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 17:26:53 +0000
Subject: [PATCH 241/323] update

---
 ...i_loop_with_while_loop_simple_add_dispatch_in_torch.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 287797bab62..e60de52f2bb 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -94,16 +94,18 @@ def test_while_loop_tpu_simple_linear(self):
 
     # def forward(self, upper, lower, one_value, x, input_value, output_value):
     linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
+    weight_0 = linear_0.weight_
+    bias_0 = linear_0.bias_
 
-    def cond_fn(upper, lower, one_value, x, input_value, output_value):
+    def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
       return lower[0] < upper[0]
 
-    def body_fn(upper, lower, one_value, x, input_value, output_value):
+    def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
       new_lower = torch.add(one_value, lower)
       output_value_real = linear_0(input_value)
       weight = linear_0.weight # not be used actually, initialized as placeholder xlacomputation requirement 
       bias = linear_0.bias # not be used actually, initialized as placeholder xlacomputation requirement 
-      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real, weight.clone(), bias.clone()
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), weight.clone(), output_value_real, bias.clone()
 
     upper = torch.tensor([52], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)

From abc2c9c77b971e6a0cdbff3c7e8a5c3f4d1349cd Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 17:27:27 +0000
Subject: [PATCH 242/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index e60de52f2bb..ec29fa9e0f7 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -94,8 +94,8 @@ def test_while_loop_tpu_simple_linear(self):
 
     # def forward(self, upper, lower, one_value, x, input_value, output_value):
     linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
-    weight_0 = linear_0.weight_
-    bias_0 = linear_0.bias_
+    weight_0 = linear_0.weight
+    bias_0 = linear_0.bias
 
     def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
       return lower[0] < upper[0]

From c7d46545bf0a59795b675a55a0a695541f732d52 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 18:14:45 +0000
Subject: [PATCH 243/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index ec29fa9e0f7..01178fb6fa5 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -86,6 +86,7 @@ def body_fn(init, limit_value):
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
 
+# debugging
   def test_while_loop_tpu_simple_linear(self):
 
     xm.mark_step()
@@ -105,7 +106,7 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_va
       output_value_real = linear_0(input_value)
       weight = linear_0.weight # not be used actually, initialized as placeholder xlacomputation requirement 
       bias = linear_0.bias # not be used actually, initialized as placeholder xlacomputation requirement 
-      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), weight.clone(), output_value_real, bias.clone()
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), weight.clone(), bias.clone(), output_value_real
 
     upper = torch.tensor([52], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)

From 5dbbffb396f37cfd27aaa8557bdf592c194a7fb1 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 18:18:37 +0000
Subject: [PATCH 244/323] update

---
 torch_xla/experimental/fori_loop.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 0160fca2d5a..d4fc5c96d09 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -40,12 +40,14 @@ def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   # TODO(@manfei): PyTorch require carried_inputs to be list/tuple, PyTorch/XLA _xla_while_loop only accept *operands, *operands would tuple items again: (a, '')
   # cond_fn&body_fn: callable
   # carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)
+  print("arrive here @while_loop_op.py_impl(DispatchKey.XLA) !!!")
   if additional_inputs is None:
     additional_inputs = tuple()
   return _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs)
 
 
 def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
+  print("arrive here _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None): !!!")
   # fake carried_inputs to split formal code
   fake_carried_inputs = []
   for carried_input in carried_inputs:

From 5fca07d41090cd6424fa8aa3c0b1220c79c6ac26 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 18:28:38 +0000
Subject: [PATCH 245/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 01178fb6fa5..ba045688258 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -115,7 +115,7 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_va
     l_in_0 = torch.ones(10, device=xm.xla_device()) # input_value
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
-    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
+    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = while_loop( cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 

From 35ed14dd5da3cb5a8d5cf221840fb0cc0c54145e Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 18:33:44 +0000
Subject: [PATCH 246/323] update

---
 torch_xla/experimental/fori_loop.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index d4fc5c96d09..418080ed90a 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -41,6 +41,8 @@ def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   # cond_fn&body_fn: callable
   # carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)
   print("arrive here @while_loop_op.py_impl(DispatchKey.XLA) !!!")
+  print("carried_inputs: ", carried_inputs)
+  print("additional_inputs: ", additional_inputs)
   if additional_inputs is None:
     additional_inputs = tuple()
   return _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs)

From b826cf9b0ec0f437e00cf340f374d82b9a76fd72 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 18:38:07 +0000
Subject: [PATCH 247/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index ba045688258..d1028fea36b 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -115,7 +115,7 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_va
     l_in_0 = torch.ones(10, device=xm.xla_device()) # input_value
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
-    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = while_loop( cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, weight_0, bias_0, output_value))
+    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = while_loop( cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value, weight_0, bias_0))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 

From df34a6bd967a0ecae8e94f6f7deaf5acbf9f74ce Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 18:41:37 +0000
Subject: [PATCH 248/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index d1028fea36b..f16abca316b 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -115,7 +115,7 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_va
     l_in_0 = torch.ones(10, device=xm.xla_device()) # input_value
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
-    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = while_loop( cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value, weight_0, bias_0))
+    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = while_loop( cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 

From fde54ac7038908d24042a3460d86be619b2b954e Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 18:43:00 +0000
Subject: [PATCH 249/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index f16abca316b..0ac78cffbda 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -98,10 +98,10 @@ def test_while_loop_tpu_simple_linear(self):
     weight_0 = linear_0.weight
     bias_0 = linear_0.bias
 
-    def cond_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+    def cond_fn(upper, lower, one_value, x, input_value, output_value):
       return lower[0] < upper[0]
 
-    def body_fn(upper, lower, one_value, x, input_value, weight_0, bias_0, output_value):
+    def body_fn(upper, lower, one_value, x, input_value, output_value):
       new_lower = torch.add(one_value, lower)
       output_value_real = linear_0(input_value)
       weight = linear_0.weight # not be used actually, initialized as placeholder xlacomputation requirement 

From 1f2f50e21d00cfc06d1358814b483eaa37a7f90f Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 19:28:54 +0000
Subject: [PATCH 250/323] update

---
 ...while_loop_simple_add_dispatch_in_torch.py |  2 +-
 torch_xla/experimental/fori_loop.py           | 46 ++++++++++---------
 2 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 0ac78cffbda..511f8a8d853 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -106,7 +106,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       output_value_real = linear_0(input_value)
       weight = linear_0.weight # not be used actually, initialized as placeholder xlacomputation requirement 
       bias = linear_0.bias # not be used actually, initialized as placeholder xlacomputation requirement 
-      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), weight.clone(), bias.clone(), output_value_real
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real, weight.clone(), bias.clone() # , output_value_real
 
     upper = torch.tensor([52], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 418080ed90a..a69dd8d99df 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -12,7 +12,7 @@
 from torch._higher_order_ops.while_loop import while_loop_op
 
 
-# TODO(@manfei): delete one_value?
+### TODO(@manfei): delete one_value?
 def fori_loop(upper, lower, body_fun, init_val, *input_value):
 
   device = xm.xla_device()
@@ -23,8 +23,8 @@ def cond_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bi
   def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0):
     new_lower = torch.add(one_value, lower)
     output_value = body_fun(*input_value)
-    weight = body_fun.weight  # not be used actually, initialized as placeholder xlacomputation requirement
-    bias = body_fun.bias  # not be used actually, initialized as placeholder xlacomputation requirement
+    weight = body_fun.weight  ### not be used actually, initialized as placeholder xlacomputation requirement
+    bias = body_fun.bias  ### not be used actually, initialized as placeholder xlacomputation requirement
     return upper, new_lower, one_value, torch.add(one_value, x), *input_value, weight, bias, output_value
 
   output_value = torch.zeros([20], dtype=torch.float32, device=device)
@@ -37,9 +37,9 @@ def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bi
 
 @while_loop_op.py_impl(DispatchKey.XLA)
 def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
-  # TODO(@manfei): PyTorch require carried_inputs to be list/tuple, PyTorch/XLA _xla_while_loop only accept *operands, *operands would tuple items again: (a, '')
-  # cond_fn&body_fn: callable
-  # carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)
+  ### TODO(@manfei): PyTorch require carried_inputs to be list/tuple, PyTorch/XLA _xla_while_loop only accept *operands, *operands would tuple items again: (a, '')
+  ### cond_fn&body_fn: callable
+  ### carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)
   print("arrive here @while_loop_op.py_impl(DispatchKey.XLA) !!!")
   print("carried_inputs: ", carried_inputs)
   print("additional_inputs: ", additional_inputs)
@@ -50,30 +50,30 @@ def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
 
 def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   print("arrive here _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None): !!!")
-  # fake carried_inputs to split formal code
+  ### fake carried_inputs to split formal code
   fake_carried_inputs = []
   for carried_input in carried_inputs:
     device = carried_input.device
-    #TODO(@manfei) type = carried_input.type
+    ###TODO(@manfei) type = carried_input.type
     fake_carried_inputs.append(
         torch.randint(10, carried_input.size(),
                       dtype=carried_input.dtype).to(device))
   for additional_input in additional_inputs:
     device = additional_input.device
-    #TODO(@manfei) type = carried_input.type
+    ###TODO(@manfei) type = carried_input.type
     fake_carried_inputs.append(
         torch.randint(10, additional_input.size(),
                       dtype=additional_input.dtype).to(device))
 
-  # TODO(@manfei): specify which element is for which argument like a,b,c
+  ### TODO(@manfei): specify which element is for which argument like a,b,c
   cond_result = cond_fn(*fake_carried_inputs)
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
-  additional_inputs_list_cond = list(fake_carried_inputs[2:]) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
-
-  tmp_bias = additional_inputs_list_cond[-2] # not used, change order doesn't affect logic
-  del additional_inputs_list_cond[-2] # not used, change order doesn't affect logic
-  additional_inputs_list_cond.append(tmp_bias) # not used, change order doesn't affect logic
+  # !!! cond xlacomputation change !!! switch bias and weight position
+  additional_inputs_list_cond = list(fake_carried_inputs[2:]) ### all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
+  tmp_bias = additional_inputs_list_cond[-2] ### not used, change order doesn't affect logic
+  del additional_inputs_list_cond[-2] ### not used, change order doesn't affect logic
+  additional_inputs_list_cond.append(tmp_bias) ### not used, change order doesn't affect logic
 
   cond_ctx.buildforiloop([cond_result], additional_inputs_list_cond)
   cond_hlo = cond_ctx.hlo()
@@ -83,12 +83,13 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   # print("cond computation: !!!!!!!!!")
   # print(cond_hlo_print)
 
-  # generate body_fn xlacomputation
+  ### generate body_fn xlacomputation
   body_result = body_fn(*fake_carried_inputs)
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
   body_ctx.set_name_string("bodyctx")
+  # !!! body xlacomputation change !!! add output_value argument
   additional_inputs_list_body = [fake_carried_inputs[-3]]
-  # TODO(@manfei): treat hard-code parameters: additional_inputs_list_body
+  ### TODO(@manfei): treat hard-code parameters: additional_inputs_list_body
   body_ctx.buildforiloop(list(body_result), additional_inputs_list_body)
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",
@@ -97,7 +98,7 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   # print("body computation: !!!!!!!!!")
   # print(body_hlo_print)
 
-  # trans fake_carried_inputs from list(tensor) to list(xla::op), which part could change init of xla::while
+  ### trans fake_carried_inputs from list(tensor) to list(xla::op), which part could change init of xla::while
   total_inputs = carried_inputs + additional_inputs
   kwargs = {}
   if type(total_inputs) is tuple:
@@ -110,12 +111,13 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
     p = xb.mkparam(builder, len(params), shape)
     params.append(p)
 
-  # TODO(@manfei): treat hard-code input arguments
+  ### TODO(@manfei): treat hard-code input arguments
+  # !!! init change !!! 
   tmp_bias = params[-2]
   del params[-2]
   params.append(tmp_bias)
 
-  # generate while xlacomputation
+  ### generate while xlacomputation
   input_tuple = xb.Op.tuple(tuple(params))
   w = xb.mkop(
       'While', (input_tuple.op,),
@@ -127,10 +129,10 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   print("while computation: !!!!!!!!!")
   print(hlo_print)
 
-  # gain final result with generated while xlacomputation
+  ### gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',
                                                  (total_inputs),
                                                  computation)
-  # print("!!! arrive here too after while!!!")
+  ### print("!!! arrive here too after while!!!")
 
   return result
\ No newline at end of file

From 27f20b7489f2592d51a0c13781a1c304de831c5b Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 22:36:39 +0000
Subject: [PATCH 251/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 511f8a8d853..33b11e8eafe 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -106,7 +106,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       output_value_real = linear_0(input_value)
       weight = linear_0.weight # not be used actually, initialized as placeholder xlacomputation requirement 
       bias = linear_0.bias # not be used actually, initialized as placeholder xlacomputation requirement 
-      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real, weight.clone(), bias.clone() # , output_value_real
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real.clone(), weight.clone(), bias.clone() # , output_value_real
 
     upper = torch.tensor([52], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)

From 43af2152df0ad7a0ac87bdf4f5b58f6c60431cbe Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Tue, 16 Apr 2024 22:38:17 +0000
Subject: [PATCH 252/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 33b11e8eafe..d5bedc787f6 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -155,7 +155,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     lower = torch.tensor([0], dtype=torch.int32, device=device)
     one_value = torch.tensor([1], dtype=torch.int32, device=device)
     init_val = torch.tensor([1], dtype=torch.int32, device=device)
-    l_in_0 = torch.ones(10, device=xm.xla_device()) # input_value
+    l_in_0 = torch.randint(10, device=xm.xla_device()) # input_value
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
     weight_0 = simple_with_linear.linear.weight

From 79bd30345593dde19a3cfbefc1bbe22be5907577 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 05:00:25 +0000
Subject: [PATCH 253/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index d5bedc787f6..1355697b32d 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -155,7 +155,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     lower = torch.tensor([0], dtype=torch.int32, device=device)
     one_value = torch.tensor([1], dtype=torch.int32, device=device)
     init_val = torch.tensor([1], dtype=torch.int32, device=device)
-    l_in_0 = torch.randint(10, device=xm.xla_device()) # input_value
+    l_in_0 = torch.randint(10, (10,), device=xm.xla_device()) # input_value
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
     weight_0 = simple_with_linear.linear.weight

From 08a155ede78326be59ba446c3592a9b9813e17f4 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 05:02:08 +0000
Subject: [PATCH 254/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 1355697b32d..c8119721b19 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -155,7 +155,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     lower = torch.tensor([0], dtype=torch.int32, device=device)
     one_value = torch.tensor([1], dtype=torch.int32, device=device)
     init_val = torch.tensor([1], dtype=torch.int32, device=device)
-    l_in_0 = torch.randint(10, (10,), device=xm.xla_device()) # input_value
+    l_in_0 = torch.rand(10, device=xm.xla_device()) # input_value
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
     weight_0 = simple_with_linear.linear.weight

From 2840aefe6af8a5035c45103e763094cbaeb3749f Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 17:09:58 +0000
Subject: [PATCH 255/323] update

---
 ...est_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index c8119721b19..0b578ded851 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -113,6 +113,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     one_value = torch.tensor([1], dtype=torch.int32, device=device)
     init_val = torch.tensor([1], dtype=torch.int32, device=device)
     l_in_0 = torch.ones(10, device=xm.xla_device()) # input_value
+    # l_in_0 = torch.rand(10, device=xm.xla_device()) # input_value
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
     upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = while_loop( cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))

From f144d25c3306a0ba45b89a56066b96a93fae6f3f Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 17:17:43 +0000
Subject: [PATCH 256/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 0b578ded851..76ad459798e 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -112,8 +112,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     lower = torch.tensor([0], dtype=torch.int32, device=device)
     one_value = torch.tensor([1], dtype=torch.int32, device=device)
     init_val = torch.tensor([1], dtype=torch.int32, device=device)
-    l_in_0 = torch.ones(10, device=xm.xla_device()) # input_value
-    # l_in_0 = torch.rand(10, device=xm.xla_device()) # input_value
+    # l_in_0 = torch.ones(10, device=xm.xla_device()) # input_value
+    l_in_0 = torch.rand(10, device=xm.xla_device()) # input_value
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
     upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = while_loop( cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))

From 03743edbdc91df0739f1ffaab6258686041ad190 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 17:30:53 +0000
Subject: [PATCH 257/323] update

---
 ...i_loop_with_while_loop_simple_add_dispatch_in_torch.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 76ad459798e..e00f2ffeaa1 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -95,8 +95,8 @@ def test_while_loop_tpu_simple_linear(self):
 
     # def forward(self, upper, lower, one_value, x, input_value, output_value):
     linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
-    weight_0 = linear_0.weight
-    bias_0 = linear_0.bias
+    # weight_0 = linear_0.weight
+    # bias_0 = linear_0.bias
 
     def cond_fn(upper, lower, one_value, x, input_value, output_value):
       return lower[0] < upper[0]
@@ -115,8 +115,10 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     # l_in_0 = torch.ones(10, device=xm.xla_device()) # input_value
     l_in_0 = torch.rand(10, device=xm.xla_device()) # input_value
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
+    # weight_0 = linear_0.weight
+    # bias_0 = linear_0.bias
 
-    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = while_loop( cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
+    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 

From 118e64023d783e47c9c66f467a44a312f575f6bf Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 18:00:07 +0000
Subject: [PATCH 258/323] update

---
 ...ori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index e00f2ffeaa1..bbcfdb60d83 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -103,10 +103,12 @@ def cond_fn(upper, lower, one_value, x, input_value, output_value):
 
     def body_fn(upper, lower, one_value, x, input_value, output_value):
       new_lower = torch.add(one_value, lower)
-      output_value_real = linear_0(input_value)
+      # output_value_real = linear_0(input_value)
+      output_value = linear_0(input_value)
       weight = linear_0.weight # not be used actually, initialized as placeholder xlacomputation requirement 
       bias = linear_0.bias # not be used actually, initialized as placeholder xlacomputation requirement 
-      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real.clone(), weight.clone(), bias.clone() # , output_value_real
+      # return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real.clone(), weight.clone(), bias.clone() # , output_value_real
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value.clone(), weight.clone(), bias.clone() # , output_value_real
 
     upper = torch.tensor([52], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)

From 915ca5a6126e5323340cb55cc2e7e8aa2cf37f3a Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 18:01:17 +0000
Subject: [PATCH 259/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index bbcfdb60d83..1b771d20af1 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -108,7 +108,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       weight = linear_0.weight # not be used actually, initialized as placeholder xlacomputation requirement 
       bias = linear_0.bias # not be used actually, initialized as placeholder xlacomputation requirement 
       # return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real.clone(), weight.clone(), bias.clone() # , output_value_real
-      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value.clone(), weight.clone(), bias.clone() # , output_value_real
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), weight.clone(), bias.clone(), output_value.clone() # , output_value_real
 
     upper = torch.tensor([52], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)

From 0bca16d922038a8e377612ea0102c1d341df86cf Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 18:02:36 +0000
Subject: [PATCH 260/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 1b771d20af1..4f52e21dc6a 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -108,7 +108,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       weight = linear_0.weight # not be used actually, initialized as placeholder xlacomputation requirement 
       bias = linear_0.bias # not be used actually, initialized as placeholder xlacomputation requirement 
       # return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real.clone(), weight.clone(), bias.clone() # , output_value_real
-      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), weight.clone(), bias.clone(), output_value.clone() # , output_value_real
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() # , output_value_real
 
     upper = torch.tensor([52], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)

From 0b04e2819ec1f0065eb833750bd5f7f709200ad3 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 18:03:19 +0000
Subject: [PATCH 261/323] update

---
 ...est_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 4f52e21dc6a..173afeba744 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -126,6 +126,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
 
     print("torch_add_res__: ", torch_add_res__)
     print("output_value_real__: ", output_value_real__)
+    print("bias__: ", bias__)
     print("expected: ", expected)
 
     self.assertTrue(torch.all(torch.eq(expected, output_value_real__)))

From 68e9aba51643232884dc216a898aa0a9086f928e Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 18:15:33 +0000
Subject: [PATCH 262/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 173afeba744..58bc78c806e 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -110,7 +110,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       # return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real.clone(), weight.clone(), bias.clone() # , output_value_real
       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() # , output_value_real
 
-    upper = torch.tensor([52], dtype=torch.int32, device=device)
+    upper = torch.tensor([1], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)
     one_value = torch.tensor([1], dtype=torch.int32, device=device)
     init_val = torch.tensor([1], dtype=torch.int32, device=device)

From 94d004582913e9e520112782550bd9b01d2d6f9b Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 18:21:02 +0000
Subject: [PATCH 263/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 58bc78c806e..b4344c90d31 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -96,7 +96,8 @@ def test_while_loop_tpu_simple_linear(self):
     # def forward(self, upper, lower, one_value, x, input_value, output_value):
     linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
     # weight_0 = linear_0.weight
-    # bias_0 = linear_0.bias
+    bias_0 = linear_0.bias
+    print("original bias: ", bias_0)
 
     def cond_fn(upper, lower, one_value, x, input_value, output_value):
       return lower[0] < upper[0]

From b9192beb819839246c55d636bf1f75861cc95235 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 18:21:55 +0000
Subject: [PATCH 264/323] update

---
 torch_xla/experimental/fori_loop.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index a69dd8d99df..b239d7eb56d 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -94,9 +94,9 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",
                                                       body_hlo)
-  # body_hlo_print = xb.get_computation_hlo(body_computation)
-  # print("body computation: !!!!!!!!!")
-  # print(body_hlo_print)
+  body_hlo_print = xb.get_computation_hlo(body_computation)
+  print("body computation: !!!!!!!!!")
+  print(body_hlo_print)
 
   ### trans fake_carried_inputs from list(tensor) to list(xla::op), which part could change init of xla::while
   total_inputs = carried_inputs + additional_inputs
@@ -125,9 +125,9 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
       body_computation=body_computation)
   name = 'fori_loop_ed_torch_func'
   computation = w.build(name)
-  hlo_print = xb.get_computation_hlo(computation)
-  print("while computation: !!!!!!!!!")
-  print(hlo_print)
+  # hlo_print = xb.get_computation_hlo(computation)
+  # print("while computation: !!!!!!!!!")
+  # print(hlo_print)
 
   ### gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',

From 1b594e60120dfda7e71c59cf7aba77ca57e9f0e1 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 18:29:42 +0000
Subject: [PATCH 265/323] update

---
 torch_xla/experimental/fori_loop.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index b239d7eb56d..5694a1ba1f0 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -64,11 +64,13 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
     fake_carried_inputs.append(
         torch.randint(10, additional_input.size(),
                       dtype=additional_input.dtype).to(device))
+  print("fake_carried_inputs: ", fake_carried_inputs)
 
   ### TODO(@manfei): specify which element is for which argument like a,b,c
   cond_result = cond_fn(*fake_carried_inputs)
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
+
   # !!! cond xlacomputation change !!! switch bias and weight position
   additional_inputs_list_cond = list(fake_carried_inputs[2:]) ### all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
   tmp_bias = additional_inputs_list_cond[-2] ### not used, change order doesn't affect logic
@@ -87,6 +89,7 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   body_result = body_fn(*fake_carried_inputs)
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
   body_ctx.set_name_string("bodyctx")
+
   # !!! body xlacomputation change !!! add output_value argument
   additional_inputs_list_body = [fake_carried_inputs[-3]]
   ### TODO(@manfei): treat hard-code parameters: additional_inputs_list_body

From 867e488701ec4147331586cd948c16df4bb42a42 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 18:33:55 +0000
Subject: [PATCH 266/323] update

---
 torch_xla/experimental/fori_loop.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 5694a1ba1f0..e69323f1c8b 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -71,10 +71,16 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
 
-  # !!! cond xlacomputation change !!! switch bias and weight position
+  # # !!! cond xlacomputation change !!! switch bias and weight position
+  # additional_inputs_list_cond = list(fake_carried_inputs[2:]) ### all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
+  # tmp_bias = additional_inputs_list_cond[-2] ### not used, change order doesn't affect logic
+  # del additional_inputs_list_cond[-2] ### not used, change order doesn't affect logic
+  # additional_inputs_list_cond.append(tmp_bias) ### not used, change order doesn't affect logic
+
+  # !!! cond xlacomputation change !!! switch output_value and weight position
   additional_inputs_list_cond = list(fake_carried_inputs[2:]) ### all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
-  tmp_bias = additional_inputs_list_cond[-2] ### not used, change order doesn't affect logic
-  del additional_inputs_list_cond[-2] ### not used, change order doesn't affect logic
+  tmp_bias = additional_inputs_list_cond[-3] ### not used, change order doesn't affect logic
+  del additional_inputs_list_cond[-3] ### not used, change order doesn't affect logic
   additional_inputs_list_cond.append(tmp_bias) ### not used, change order doesn't affect logic
 
   cond_ctx.buildforiloop([cond_result], additional_inputs_list_cond)

From 7a1bde9229c780fc6219710d0ac44057eeab2be9 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 18:35:20 +0000
Subject: [PATCH 267/323] update

---
 torch_xla/experimental/fori_loop.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index e69323f1c8b..c4923c76fe7 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -96,8 +96,9 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
   body_ctx.set_name_string("bodyctx")
 
-  # !!! body xlacomputation change !!! add output_value argument
+  # !!! body xlacomputation change !!! add non-changed output_value argument
   additional_inputs_list_body = [fake_carried_inputs[-3]]
+
   ### TODO(@manfei): treat hard-code parameters: additional_inputs_list_body
   body_ctx.buildforiloop(list(body_result), additional_inputs_list_body)
   body_hlo = body_ctx.hlo()
@@ -120,10 +121,16 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
     p = xb.mkparam(builder, len(params), shape)
     params.append(p)
 
+  # ### TODO(@manfei): treat hard-code input arguments
+  # # !!! init change !!! 
+  # tmp_bias = params[-2]
+  # del params[-2]
+  # params.append(tmp_bias)
+
   ### TODO(@manfei): treat hard-code input arguments
-  # !!! init change !!! 
-  tmp_bias = params[-2]
-  del params[-2]
+  # !!! init change !!! switch bias and output_value
+  tmp_bias = params[-3]
+  del params[-3]
   params.append(tmp_bias)
 
   ### generate while xlacomputation

From 120c33598e00838e943bd7a82253a86076c93881 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 18:36:36 +0000
Subject: [PATCH 268/323] update

---
 ...oop_with_while_loop_simple_add_dispatch_in_torch.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index b4344c90d31..2ddb6af2f17 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -121,14 +121,14 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     # weight_0 = linear_0.weight
     # bias_0 = linear_0.bias
 
-    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
+    upper__, lower__, one_value__, torch_add_res__, input_value__, bias__, weight__, output_value_real__, = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 
-    print("torch_add_res__: ", torch_add_res__)
-    print("output_value_real__: ", output_value_real__)
-    print("bias__: ", bias__)
-    print("expected: ", expected)
+    # print("torch_add_res__: ", torch_add_res__)
+    # print("output_value_real__: ", output_value_real__)
+    # print("bias__: ", bias__)
+    # print("expected: ", expected)
 
     self.assertTrue(torch.all(torch.eq(expected, output_value_real__)))
     return aaa

From bc5950a04ad2af1113f657f228e824fef5653b77 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 18:37:40 +0000
Subject: [PATCH 269/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 2ddb6af2f17..90d23dc2686 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -131,7 +131,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     # print("expected: ", expected)
 
     self.assertTrue(torch.all(torch.eq(expected, output_value_real__)))
-    return aaa
+    return True
 
 # passed
   def test_while_loop_tpu_simple_linear_class(self):

From 91a3aa8a11230ce8ec8f3e4ad46871f2f898ea37 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 18:38:04 +0000
Subject: [PATCH 270/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 90d23dc2686..956acee8b1a 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -130,8 +130,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     # print("bias__: ", bias__)
     # print("expected: ", expected)
 
-    self.assertTrue(torch.all(torch.eq(expected, output_value_real__)))
-    return True
+    # self.assertTrue(torch.all(torch.eq(expected, output_value_real__)))
+    return self.assertTrue(torch.all(torch.eq(expected, output_value_real__)))
 
 # passed
   def test_while_loop_tpu_simple_linear_class(self):

From 7041bc32203a74a298f1a11730aa45629647b5cf Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 18:38:54 +0000
Subject: [PATCH 271/323] update

---
 torch_xla/experimental/fori_loop.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index c4923c76fe7..3e88a5f4446 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -41,8 +41,8 @@ def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   ### cond_fn&body_fn: callable
   ### carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)
   print("arrive here @while_loop_op.py_impl(DispatchKey.XLA) !!!")
-  print("carried_inputs: ", carried_inputs)
-  print("additional_inputs: ", additional_inputs)
+  # print("carried_inputs: ", carried_inputs)
+  # print("additional_inputs: ", additional_inputs)
   if additional_inputs is None:
     additional_inputs = tuple()
   return _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs)
@@ -64,7 +64,7 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
     fake_carried_inputs.append(
         torch.randint(10, additional_input.size(),
                       dtype=additional_input.dtype).to(device))
-  print("fake_carried_inputs: ", fake_carried_inputs)
+  # print("fake_carried_inputs: ", fake_carried_inputs)
 
   ### TODO(@manfei): specify which element is for which argument like a,b,c
   cond_result = cond_fn(*fake_carried_inputs)
@@ -104,9 +104,9 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",
                                                       body_hlo)
-  body_hlo_print = xb.get_computation_hlo(body_computation)
-  print("body computation: !!!!!!!!!")
-  print(body_hlo_print)
+  # body_hlo_print = xb.get_computation_hlo(body_computation)
+  # print("body computation: !!!!!!!!!")
+  # print(body_hlo_print)
 
   ### trans fake_carried_inputs from list(tensor) to list(xla::op), which part could change init of xla::while
   total_inputs = carried_inputs + additional_inputs

From 3ec8120e9e5f4261427d2da608ded09011edb6cc Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 19:40:59 +0000
Subject: [PATCH 272/323] update

---
 ..._loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 torch_xla/experimental/fori_loop.py                      | 9 ++++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 956acee8b1a..fc803d252a3 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -86,7 +86,7 @@ def body_fn(init, limit_value):
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
 
-# debugging
+# passed
   def test_while_loop_tpu_simple_linear(self):
 
     xm.mark_step()
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 3e88a5f4446..5918fd40900 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -23,9 +23,12 @@ def cond_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bi
   def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0):
     new_lower = torch.add(one_value, lower)
     output_value = body_fun(*input_value)
-    weight = body_fun.weight  ### not be used actually, initialized as placeholder xlacomputation requirement
-    bias = body_fun.bias  ### not be used actually, initialized as placeholder xlacomputation requirement
-    return upper, new_lower, one_value, torch.add(one_value, x), *input_value, weight, bias, output_value
+    if (hasattr(body_fun, 'weight') and hasattr(body_fun, 'bias')):
+      weight = body_fun.weight  ### not be used actually, initialized as placeholder xlacomputation requirement
+      bias = body_fun.bias  ### not be used actually, initialized as placeholder xlacomputation requirement
+      return upper, new_lower, one_value, torch.add(one_value, x), *input_value, weight, bias, output_value
+    else:
+      return upper, new_lower, one_value, torch.add(one_value, x), *input_value, output_value
 
   output_value = torch.zeros([20], dtype=torch.float32, device=device)
   weight_0 = body_fun.weight

From 9beac796744f17d1cea55556e843f3d503cb2444 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 19:42:15 +0000
Subject: [PATCH 273/323] update

---
 torch_xla/experimental/fori_loop.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 5918fd40900..d6832d2e532 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -31,10 +31,13 @@ def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bi
       return upper, new_lower, one_value, torch.add(one_value, x), *input_value, output_value
 
   output_value = torch.zeros([20], dtype=torch.float32, device=device)
-  weight_0 = body_fun.weight
-  bias_0 = body_fun.bias
   one_value = torch.tensor([1], dtype=torch.int32, device=device)
-  res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, *input_value, weight_0, bias_0, output_value))
+  if (hasattr(body_fun, 'weight') and hasattr(body_fun, 'bias')):
+    weight_0 = body_fun.weight
+    bias_0 = body_fun.bias
+    res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, *input_value, weight_0, bias_0, output_value))
+  else:
+    res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, *input_value, output_value))
   return res
 
 

From 70891937cfc91bc125f5f09d99ebb72195c0e001 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 19:44:22 +0000
Subject: [PATCH 274/323] update

---
 ...st_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 2 +-
 torch_xla/experimental/fori_loop.py                             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index fc803d252a3..7ae046f696b 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -97,7 +97,7 @@ def test_while_loop_tpu_simple_linear(self):
     linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
     # weight_0 = linear_0.weight
     bias_0 = linear_0.bias
-    print("original bias: ", bias_0)
+    # print("original bias: ", bias_0)
 
     def cond_fn(upper, lower, one_value, x, input_value, output_value):
       return lower[0] < upper[0]
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index d6832d2e532..80e9b290848 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -70,7 +70,7 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
     fake_carried_inputs.append(
         torch.randint(10, additional_input.size(),
                       dtype=additional_input.dtype).to(device))
-  # print("fake_carried_inputs: ", fake_carried_inputs)
+  print("fake_carried_inputs: ", fake_carried_inputs)
 
   ### TODO(@manfei): specify which element is for which argument like a,b,c
   cond_result = cond_fn(*fake_carried_inputs)

From 9d9bc32a84bffc10b884a0d30f744ce8b385fc87 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 19:50:11 +0000
Subject: [PATCH 275/323] update

---
 torch_xla/experimental/fori_loop.py | 30 +++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 80e9b290848..47b998857ff 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -16,28 +16,38 @@
 def fori_loop(upper, lower, body_fun, init_val, *input_value):
 
   device = xm.xla_device()
+  
+  output_value = torch.zeros([20], dtype=torch.float32, device=device)
+  one_value = torch.tensor([1], dtype=torch.int32, device=device)
 
   def cond_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0):
     return lower[0] < upper[0]
 
-  def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0):
-    new_lower = torch.add(one_value, lower)
-    output_value = body_fun(*input_value)
-    if (hasattr(body_fun, 'weight') and hasattr(body_fun, 'bias')):
+  if (hasattr(body_fun, 'weight') or hasattr(body_fun, 'bias')):
+    def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0):
+      new_lower = torch.add(one_value, lower)
+      output_value = body_fun(*input_value)
       weight = body_fun.weight  ### not be used actually, initialized as placeholder xlacomputation requirement
       bias = body_fun.bias  ### not be used actually, initialized as placeholder xlacomputation requirement
       return upper, new_lower, one_value, torch.add(one_value, x), *input_value, weight, bias, output_value
-    else:
-      return upper, new_lower, one_value, torch.add(one_value, x), *input_value, output_value
-
-  output_value = torch.zeros([20], dtype=torch.float32, device=device)
-  one_value = torch.tensor([1], dtype=torch.int32, device=device)
-  if (hasattr(body_fun, 'weight') and hasattr(body_fun, 'bias')):
     weight_0 = body_fun.weight
     bias_0 = body_fun.bias
     res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, *input_value, weight_0, bias_0, output_value))
   else:
+    def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0):
+      new_lower = torch.add(one_value, lower)
+      output_value = body_fun(*input_value)
+      return upper, new_lower, one_value, torch.add(one_value, x), *input_value, output_value
     res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, *input_value, output_value))
+
+  # output_value = torch.zeros([20], dtype=torch.float32, device=device)
+  # one_value = torch.tensor([1], dtype=torch.int32, device=device)
+  # if (hasattr(body_fun, 'weight') or hasattr(body_fun, 'bias')):
+  #   weight_0 = body_fun.weight
+  #   bias_0 = body_fun.bias
+  #   res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, *input_value, weight_0, bias_0, output_value))
+  # else:
+  #   res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, *input_value, output_value))
   return res
 
 

From 0d218c8e208247ff339b7011df4209234f3168c3 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 19:58:39 +0000
Subject: [PATCH 276/323] update

---
 torch_xla/experimental/fori_loop.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 47b998857ff..bdc03d21981 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -24,6 +24,8 @@ def cond_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bi
     return lower[0] < upper[0]
 
   if (hasattr(body_fun, 'weight') or hasattr(body_fun, 'bias')):
+    print("body_fun.weight: ", body_fun.weight)
+    print("body_fun.bias: ", body_fun.bias)
     def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0):
       new_lower = torch.add(one_value, lower)
       output_value = body_fun(*input_value)

From bae4952783bfe1ac464297bd05c892041229be55 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 20:02:30 +0000
Subject: [PATCH 277/323] update

---
 torch_xla/experimental/fori_loop.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index bdc03d21981..b09632dee2a 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -12,21 +12,34 @@
 from torch._higher_order_ops.while_loop import while_loop_op
 
 
+# ///////////////
+#     def cond_fn(upper, lower, one_value, x, input_value, output_value):
+#       return lower[0] < upper[0]
+
+#     def body_fn(upper, lower, one_value, x, input_value, output_value):
+#       new_lower = torch.add(one_value, lower)
+#       output_value = linear_0(input_value)
+#       weight = linear_0.weight
+#       bias = linear_0.bias
+#       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone()
+# ///////////////
+
+
 ### TODO(@manfei): delete one_value?
 def fori_loop(upper, lower, body_fun, init_val, *input_value):
 
   device = xm.xla_device()
-  
+
   output_value = torch.zeros([20], dtype=torch.float32, device=device)
   one_value = torch.tensor([1], dtype=torch.int32, device=device)
 
-  def cond_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0):
+  def cond_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
     return lower[0] < upper[0]
 
   if (hasattr(body_fun, 'weight') or hasattr(body_fun, 'bias')):
     print("body_fun.weight: ", body_fun.weight)
     print("body_fun.bias: ", body_fun.bias)
-    def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0):
+    def body_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
       new_lower = torch.add(one_value, lower)
       output_value = body_fun(*input_value)
       weight = body_fun.weight  ### not be used actually, initialized as placeholder xlacomputation requirement

From 7bb4791dd8c2a2bb755f894fada8c9c6cb22704c Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 20:07:04 +0000
Subject: [PATCH 278/323] update

---
 torch_xla/experimental/fori_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index b09632dee2a..73b9362f607 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -37,8 +37,8 @@ def cond_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bia
     return lower[0] < upper[0]
 
   if (hasattr(body_fun, 'weight') or hasattr(body_fun, 'bias')):
-    print("body_fun.weight: ", body_fun.weight)
-    print("body_fun.bias: ", body_fun.bias)
+    # print("body_fun.weight: ", body_fun.weight)
+    # print("body_fun.bias: ", body_fun.bias)
     def body_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
       new_lower = torch.add(one_value, lower)
       output_value = body_fun(*input_value)

From 114531545a22c2045010b58fc10e5a3faf71e172 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 20:11:19 +0000
Subject: [PATCH 279/323] update

---
 torch_xla/experimental/fori_loop.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 73b9362f607..af36c8d11c7 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -22,10 +22,22 @@
 #       weight = linear_0.weight
 #       bias = linear_0.bias
 #       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone()
+    # upper = torch.tensor([1], dtype=torch.int32, device=device)
+    # lower = torch.tensor([0], dtype=torch.int32, device=device)
+    # one_value = torch.tensor([1], dtype=torch.int32, device=device)
+    # init_val = torch.tensor([1], dtype=torch.int32, device=device)
+    # # l_in_0 = torch.ones(10, device=xm.xla_device()) # input_value
+    # l_in_0 = torch.rand(10, device=xm.xla_device()) # input_value
+    # output_value = torch.zeros([20], dtype=torch.float32, device=device)
+    # # weight_0 = linear_0.weight
+    # # bias_0 = linear_0.bias
+
+    # upper__, lower__, one_value__, torch_add_res__, input_value__, bias__, weight__, output_value_real__, = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
+
 # ///////////////
 
 
-### TODO(@manfei): delete one_value?
+### TODO(@manfei): treat *input_value
 def fori_loop(upper, lower, body_fun, init_val, *input_value):
 
   device = xm.xla_device()
@@ -41,13 +53,13 @@ def cond_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bia
     # print("body_fun.bias: ", body_fun.bias)
     def body_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
       new_lower = torch.add(one_value, lower)
-      output_value = body_fun(*input_value)
+      output_value = body_fun(input_value)
       weight = body_fun.weight  ### not be used actually, initialized as placeholder xlacomputation requirement
       bias = body_fun.bias  ### not be used actually, initialized as placeholder xlacomputation requirement
-      return upper, new_lower, one_value, torch.add(one_value, x), *input_value, weight, bias, output_value
+      return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
     weight_0 = body_fun.weight
     bias_0 = body_fun.bias
-    res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, *input_value, weight_0, bias_0, output_value))
+    res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
   else:
     def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0):
       new_lower = torch.add(one_value, lower)

From bad4c1fe3338a9dbaac1dd94093dd6785d4e6258 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 20:13:18 +0000
Subject: [PATCH 280/323] update

---
 torch_xla/experimental/fori_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index af36c8d11c7..2f603b427ae 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -84,8 +84,8 @@ def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   ### cond_fn&body_fn: callable
   ### carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)
   print("arrive here @while_loop_op.py_impl(DispatchKey.XLA) !!!")
-  # print("carried_inputs: ", carried_inputs)
-  # print("additional_inputs: ", additional_inputs)
+  print("carried_inputs: ", carried_inputs)
+  print("additional_inputs: ", additional_inputs)
   if additional_inputs is None:
     additional_inputs = tuple()
   return _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs)

From 00cd07c103cdd536f4f0032b302889d03c47a190 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 20:18:23 +0000
Subject: [PATCH 281/323] update

---
 torch_xla/experimental/fori_loop.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 2f603b427ae..b141717a157 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -45,12 +45,11 @@ def fori_loop(upper, lower, body_fun, init_val, *input_value):
   output_value = torch.zeros([20], dtype=torch.float32, device=device)
   one_value = torch.tensor([1], dtype=torch.int32, device=device)
 
-  def cond_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
-    return lower[0] < upper[0]
-
   if (hasattr(body_fun, 'weight') or hasattr(body_fun, 'bias')):
     # print("body_fun.weight: ", body_fun.weight)
     # print("body_fun.bias: ", body_fun.bias)
+    def cond_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
+      return lower[0] < upper[0]
     def body_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
       new_lower = torch.add(one_value, lower)
       output_value = body_fun(input_value)
@@ -61,10 +60,12 @@ def body_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bia
     bias_0 = body_fun.bias
     res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
   else:
-    def body_fn(upper, lower, one_value, x, *input_value, weight_0, output_value, bias_0):
+    def cond_fn(upper, lower, one_value, x, input_value, output_value):
+      return lower[0] < upper[0]
+    def body_fn(upper, lower, one_value, x, input_value, output_value):
       new_lower = torch.add(one_value, lower)
-      output_value = body_fun(*input_value)
-      return upper, new_lower, one_value, torch.add(one_value, x), *input_value, output_value
+      output_value = body_fun(input_value)
+      return upper, new_lower, one_value, torch.add(one_value, x), input_value, output_value
     res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, *input_value, output_value))
 
   # output_value = torch.zeros([20], dtype=torch.float32, device=device)

From 629f42e444cdaa39926701c9d0048e52daa061d9 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 20:20:32 +0000
Subject: [PATCH 282/323] update

---
 torch_xla/experimental/fori_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index b141717a157..fe125f4e54e 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -38,7 +38,7 @@
 
 
 ### TODO(@manfei): treat *input_value
-def fori_loop(upper, lower, body_fun, init_val, *input_value):
+def fori_loop(upper, lower, body_fun, init_val, input_value):
 
   device = xm.xla_device()
 

From d7bab3854838427ed407b8e87ecfb772952c83ef Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 20:25:18 +0000
Subject: [PATCH 283/323] update

---
 torch_xla/experimental/fori_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index fe125f4e54e..0436cb449a0 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -48,9 +48,9 @@ def fori_loop(upper, lower, body_fun, init_val, input_value):
   if (hasattr(body_fun, 'weight') or hasattr(body_fun, 'bias')):
     # print("body_fun.weight: ", body_fun.weight)
     # print("body_fun.bias: ", body_fun.bias)
-    def cond_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
+    def cond_fn(upper, lower, one_value, x, input_value, output_value):
       return lower[0] < upper[0]
-    def body_fn(upper, lower, one_value, x, input_value, weight_0, output_value, bias_0):
+    def body_fn(upper, lower, one_value, x, input_value, output_value):
       new_lower = torch.add(one_value, lower)
       output_value = body_fun(input_value)
       weight = body_fun.weight  ### not be used actually, initialized as placeholder xlacomputation requirement

From 2e485a50cc49e40e5b4cd8b57b65a26122add77b Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 20:34:43 +0000
Subject: [PATCH 284/323] update

---
 torch_xla/experimental/fori_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 0436cb449a0..ef5f2156043 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -56,8 +56,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       weight = body_fun.weight  ### not be used actually, initialized as placeholder xlacomputation requirement
       bias = body_fun.bias  ### not be used actually, initialized as placeholder xlacomputation requirement
       return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
-    weight_0 = body_fun.weight
-    bias_0 = body_fun.bias
+    # weight_0 = body_fun.weight
+    # bias_0 = body_fun.bias
     res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
   else:
     def cond_fn(upper, lower, one_value, x, input_value, output_value):

From 5ec118425c2300dfb9601e93aaba9301344d669c Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 20:38:38 +0000
Subject: [PATCH 285/323] update

---
 torch_xla/experimental/fori_loop.py | 43 ++++++++++++-----------------
 1 file changed, 17 insertions(+), 26 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index ef5f2156043..0073d9d1ccd 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -12,31 +12,6 @@
 from torch._higher_order_ops.while_loop import while_loop_op
 
 
-# ///////////////
-#     def cond_fn(upper, lower, one_value, x, input_value, output_value):
-#       return lower[0] < upper[0]
-
-#     def body_fn(upper, lower, one_value, x, input_value, output_value):
-#       new_lower = torch.add(one_value, lower)
-#       output_value = linear_0(input_value)
-#       weight = linear_0.weight
-#       bias = linear_0.bias
-#       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone()
-    # upper = torch.tensor([1], dtype=torch.int32, device=device)
-    # lower = torch.tensor([0], dtype=torch.int32, device=device)
-    # one_value = torch.tensor([1], dtype=torch.int32, device=device)
-    # init_val = torch.tensor([1], dtype=torch.int32, device=device)
-    # # l_in_0 = torch.ones(10, device=xm.xla_device()) # input_value
-    # l_in_0 = torch.rand(10, device=xm.xla_device()) # input_value
-    # output_value = torch.zeros([20], dtype=torch.float32, device=device)
-    # # weight_0 = linear_0.weight
-    # # bias_0 = linear_0.bias
-
-    # upper__, lower__, one_value__, torch_add_res__, input_value__, bias__, weight__, output_value_real__, = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
-
-# ///////////////
-
-
 ### TODO(@manfei): treat *input_value
 def fori_loop(upper, lower, body_fun, init_val, input_value):
 
@@ -45,6 +20,21 @@ def fori_loop(upper, lower, body_fun, init_val, input_value):
   output_value = torch.zeros([20], dtype=torch.float32, device=device)
   one_value = torch.tensor([1], dtype=torch.int32, device=device)
 
+# /////////
+#     def cond_fn(upper, lower, one_value, x, input_value, output_value):
+#       return lower[0] < upper[0]
+
+#     def body_fn(upper, lower, one_value, x, input_value, output_value):
+#       new_lower = torch.add(one_value, lower)
+#       # output_value_real = linear_0(input_value)
+#       output_value = linear_0(input_value)
+#       weight = linear_0.weight # not be used actually, initialized as placeholder xlacomputation requirement 
+#       bias = linear_0.bias # not be used actually, initialized as placeholder xlacomputation requirement 
+#       # return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real.clone(), weight.clone(), bias.clone() # , output_value_real
+#       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() # , output_value_real
+
+# /////////
+
   if (hasattr(body_fun, 'weight') or hasattr(body_fun, 'bias')):
     # print("body_fun.weight: ", body_fun.weight)
     # print("body_fun.bias: ", body_fun.bias)
@@ -55,7 +45,8 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       output_value = body_fun(input_value)
       weight = body_fun.weight  ### not be used actually, initialized as placeholder xlacomputation requirement
       bias = body_fun.bias  ### not be used actually, initialized as placeholder xlacomputation requirement
-      return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
+      # return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() # 
     # weight_0 = body_fun.weight
     # bias_0 = body_fun.bias
     res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))

From db3c8bfc11730a3df3ed0bf65f39f96da478b8bb Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 20:40:07 +0000
Subject: [PATCH 286/323] update

---
 ...ith_while_loop_simple_add_dispatch_in_torch.py |  2 +-
 torch_xla/experimental/fori_loop.py               | 15 ---------------
 2 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 7ae046f696b..1cdc0e054af 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -96,7 +96,7 @@ def test_while_loop_tpu_simple_linear(self):
     # def forward(self, upper, lower, one_value, x, input_value, output_value):
     linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
     # weight_0 = linear_0.weight
-    bias_0 = linear_0.bias
+    # bias_0 = linear_0.bias
     # print("original bias: ", bias_0)
 
     def cond_fn(upper, lower, one_value, x, input_value, output_value):
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 0073d9d1ccd..15b12464d2c 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -20,21 +20,6 @@ def fori_loop(upper, lower, body_fun, init_val, input_value):
   output_value = torch.zeros([20], dtype=torch.float32, device=device)
   one_value = torch.tensor([1], dtype=torch.int32, device=device)
 
-# /////////
-#     def cond_fn(upper, lower, one_value, x, input_value, output_value):
-#       return lower[0] < upper[0]
-
-#     def body_fn(upper, lower, one_value, x, input_value, output_value):
-#       new_lower = torch.add(one_value, lower)
-#       # output_value_real = linear_0(input_value)
-#       output_value = linear_0(input_value)
-#       weight = linear_0.weight # not be used actually, initialized as placeholder xlacomputation requirement 
-#       bias = linear_0.bias # not be used actually, initialized as placeholder xlacomputation requirement 
-#       # return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real.clone(), weight.clone(), bias.clone() # , output_value_real
-#       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() # , output_value_real
-
-# /////////
-
   if (hasattr(body_fun, 'weight') or hasattr(body_fun, 'bias')):
     # print("body_fun.weight: ", body_fun.weight)
     # print("body_fun.bias: ", body_fun.bias)

From 14f056962e16a2c9c0d7d7d3141d9c4dda52c179 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 20:41:45 +0000
Subject: [PATCH 287/323] update

---
 torch_xla/experimental/fori_loop.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 15b12464d2c..c2ba37f2473 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -16,6 +16,7 @@
 def fori_loop(upper, lower, body_fun, init_val, input_value):
 
   device = xm.xla_device()
+  body_fun = torch.nn.Linear(10, 20).to(xm.xla_device())
 
   output_value = torch.zeros([20], dtype=torch.float32, device=device)
   one_value = torch.tensor([1], dtype=torch.int32, device=device)

From d07015a03effef894117ad1075bbe93e9e19fa35 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 20:46:33 +0000
Subject: [PATCH 288/323] update

---
 torch_xla/experimental/fori_loop.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index c2ba37f2473..40992de631e 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -16,11 +16,25 @@
 def fori_loop(upper, lower, body_fun, init_val, input_value):
 
   device = xm.xla_device()
-  body_fun = torch.nn.Linear(10, 20).to(xm.xla_device())
+  # body_fun = torch.nn.Linear(10, 20).to(xm.xla_device())
 
   output_value = torch.zeros([20], dtype=torch.float32, device=device)
   one_value = torch.tensor([1], dtype=torch.int32, device=device)
 
+  def cond_fn(upper, lower, one_value, x, input_value, output_value):
+    return lower[0] < upper[0]
+  def body_fn(upper, lower, one_value, x, input_value, output_value):
+    new_lower = torch.add(one_value, lower)
+    output_value = body_fun(input_value)
+    weight = body_fun.weight  ### not be used actually, initialized as placeholder xlacomputation requirement
+    bias = body_fun.bias  ### not be used actually, initialized as placeholder xlacomputation requirement
+    # return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
+    return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() 
+  # weight_0 = body_fun.weight
+  # bias_0 = body_fun.bias
+  res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
+  return res
+
   if (hasattr(body_fun, 'weight') or hasattr(body_fun, 'bias')):
     # print("body_fun.weight: ", body_fun.weight)
     # print("body_fun.bias: ", body_fun.bias)
@@ -32,7 +46,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       weight = body_fun.weight  ### not be used actually, initialized as placeholder xlacomputation requirement
       bias = body_fun.bias  ### not be used actually, initialized as placeholder xlacomputation requirement
       # return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
-      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() # 
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() 
     # weight_0 = body_fun.weight
     # bias_0 = body_fun.bias
     res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
@@ -43,7 +57,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       new_lower = torch.add(one_value, lower)
       output_value = body_fun(input_value)
       return upper, new_lower, one_value, torch.add(one_value, x), input_value, output_value
-    res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, *input_value, output_value))
+    res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
 
   # output_value = torch.zeros([20], dtype=torch.float32, device=device)
   # one_value = torch.tensor([1], dtype=torch.int32, device=device)

From 55061c3a17f15d342ae8a0cd13b29b75eae70502 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:00:05 +0000
Subject: [PATCH 289/323] update

---
 torch_xla/experimental/fori_loop.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 40992de631e..77eb4485f78 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -10,6 +10,7 @@
 from torch._ops import HigherOrderOperator
 import torch._higher_order_ops.while_loop
 from torch._higher_order_ops.while_loop import while_loop_op
+from torch._higher_order_ops.while_loop import while_loop
 
 
 ### TODO(@manfei): treat *input_value

From f1a596a724d12c086d2f1826f12931ddad5ea391 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:01:28 +0000
Subject: [PATCH 290/323] update

---
 torch_xla/experimental/fori_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 77eb4485f78..e634fb797a2 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -10,7 +10,7 @@
 from torch._ops import HigherOrderOperator
 import torch._higher_order_ops.while_loop
 from torch._higher_order_ops.while_loop import while_loop_op
-from torch._higher_order_ops.while_loop import while_loop
+from torch._higher_order_ops.while_loop import while_loop as torch_while_loop
 
 
 ### TODO(@manfei): treat *input_value
@@ -33,7 +33,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() 
   # weight_0 = body_fun.weight
   # bias_0 = body_fun.bias
-  res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
+  res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
   return res
 
   if (hasattr(body_fun, 'weight') or hasattr(body_fun, 'bias')):

From 3818380dcdfc6f3fe2abe309a617d93c1e9ba671 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:02:56 +0000
Subject: [PATCH 291/323] update

---
 torch_xla/experimental/fori_loop.py | 30 ++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index e634fb797a2..2ecb88fc1cc 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -22,19 +22,19 @@ def fori_loop(upper, lower, body_fun, init_val, input_value):
   output_value = torch.zeros([20], dtype=torch.float32, device=device)
   one_value = torch.tensor([1], dtype=torch.int32, device=device)
 
-  def cond_fn(upper, lower, one_value, x, input_value, output_value):
-    return lower[0] < upper[0]
-  def body_fn(upper, lower, one_value, x, input_value, output_value):
-    new_lower = torch.add(one_value, lower)
-    output_value = body_fun(input_value)
-    weight = body_fun.weight  ### not be used actually, initialized as placeholder xlacomputation requirement
-    bias = body_fun.bias  ### not be used actually, initialized as placeholder xlacomputation requirement
-    # return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
-    return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() 
-  # weight_0 = body_fun.weight
-  # bias_0 = body_fun.bias
-  res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
-  return res
+  # def cond_fn(upper, lower, one_value, x, input_value, output_value):
+  #   return lower[0] < upper[0]
+  # def body_fn(upper, lower, one_value, x, input_value, output_value):
+  #   new_lower = torch.add(one_value, lower)
+  #   output_value = body_fun(input_value)
+  #   weight = body_fun.weight  ### not be used actually, initialized as placeholder xlacomputation requirement
+  #   bias = body_fun.bias  ### not be used actually, initialized as placeholder xlacomputation requirement
+  #   # return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
+  #   return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() 
+  # # weight_0 = body_fun.weight
+  # # bias_0 = body_fun.bias
+  # res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
+  # return res
 
   if (hasattr(body_fun, 'weight') or hasattr(body_fun, 'bias')):
     # print("body_fun.weight: ", body_fun.weight)
@@ -50,7 +50,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() 
     # weight_0 = body_fun.weight
     # bias_0 = body_fun.bias
-    res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
+    res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
   else:
     def cond_fn(upper, lower, one_value, x, input_value, output_value):
       return lower[0] < upper[0]
@@ -58,7 +58,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       new_lower = torch.add(one_value, lower)
       output_value = body_fun(input_value)
       return upper, new_lower, one_value, torch.add(one_value, x), input_value, output_value
-    res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
+    res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
 
   # output_value = torch.zeros([20], dtype=torch.float32, device=device)
   # one_value = torch.tensor([1], dtype=torch.int32, device=device)

From b70a34d5f2e0e203821828dadd9b6c73203625a9 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:03:48 +0000
Subject: [PATCH 292/323] update

---
 torch_xla/experimental/fori_loop.py | 28 ----------------------------
 1 file changed, 28 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 2ecb88fc1cc..cf94532ca15 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -17,28 +17,11 @@
 def fori_loop(upper, lower, body_fun, init_val, input_value):
 
   device = xm.xla_device()
-  # body_fun = torch.nn.Linear(10, 20).to(xm.xla_device())
 
   output_value = torch.zeros([20], dtype=torch.float32, device=device)
   one_value = torch.tensor([1], dtype=torch.int32, device=device)
 
-  # def cond_fn(upper, lower, one_value, x, input_value, output_value):
-  #   return lower[0] < upper[0]
-  # def body_fn(upper, lower, one_value, x, input_value, output_value):
-  #   new_lower = torch.add(one_value, lower)
-  #   output_value = body_fun(input_value)
-  #   weight = body_fun.weight  ### not be used actually, initialized as placeholder xlacomputation requirement
-  #   bias = body_fun.bias  ### not be used actually, initialized as placeholder xlacomputation requirement
-  #   # return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
-  #   return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() 
-  # # weight_0 = body_fun.weight
-  # # bias_0 = body_fun.bias
-  # res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
-  # return res
-
   if (hasattr(body_fun, 'weight') or hasattr(body_fun, 'bias')):
-    # print("body_fun.weight: ", body_fun.weight)
-    # print("body_fun.bias: ", body_fun.bias)
     def cond_fn(upper, lower, one_value, x, input_value, output_value):
       return lower[0] < upper[0]
     def body_fn(upper, lower, one_value, x, input_value, output_value):
@@ -46,10 +29,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       output_value = body_fun(input_value)
       weight = body_fun.weight  ### not be used actually, initialized as placeholder xlacomputation requirement
       bias = body_fun.bias  ### not be used actually, initialized as placeholder xlacomputation requirement
-      # return upper, new_lower, one_value, torch.add(one_value, x), input_value, weight, bias, output_value
       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() 
-    # weight_0 = body_fun.weight
-    # bias_0 = body_fun.bias
     res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
   else:
     def cond_fn(upper, lower, one_value, x, input_value, output_value):
@@ -60,14 +40,6 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       return upper, new_lower, one_value, torch.add(one_value, x), input_value, output_value
     res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
 
-  # output_value = torch.zeros([20], dtype=torch.float32, device=device)
-  # one_value = torch.tensor([1], dtype=torch.int32, device=device)
-  # if (hasattr(body_fun, 'weight') or hasattr(body_fun, 'bias')):
-  #   weight_0 = body_fun.weight
-  #   bias_0 = body_fun.bias
-  #   res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, *input_value, weight_0, bias_0, output_value))
-  # else:
-  #   res = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, *input_value, output_value))
   return res
 
 

From 26d2fdb57486ff7cd246af243aa72518a31a78b2 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:06:46 +0000
Subject: [PATCH 293/323] update

---
 ...ri_loop_with_while_loop_simple_add_dispatch_in_torch.py | 1 +
 torch_xla/experimental/fori_loop.py                        | 7 ++++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 1cdc0e054af..322829a1e61 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -198,6 +198,7 @@ def body_fun(a, b):
     expected = _fake_fori_loop(lower, upper, body_fun, init_val, one_value)
     self.assertEqual(expected, res_)
 
+# passed
   def test_fori_loop_tpu_simple_linear(self):
 
     xm.mark_step()
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index cf94532ca15..3aac50599a0 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -87,9 +87,10 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
 
   # !!! cond xlacomputation change !!! switch output_value and weight position
   additional_inputs_list_cond = list(fake_carried_inputs[2:]) ### all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
-  tmp_bias = additional_inputs_list_cond[-3] ### not used, change order doesn't affect logic
-  del additional_inputs_list_cond[-3] ### not used, change order doesn't affect logic
-  additional_inputs_list_cond.append(tmp_bias) ### not used, change order doesn't affect logic
+  if additional_inputs:
+    tmp_bias = additional_inputs_list_cond[-3] ### not used, change order doesn't affect logic
+    del additional_inputs_list_cond[-3] ### not used, change order doesn't affect logic
+    additional_inputs_list_cond.append(tmp_bias) ### not used, change order doesn't affect logic
 
   cond_ctx.buildforiloop([cond_result], additional_inputs_list_cond)
   cond_hlo = cond_ctx.hlo()

From c8e39fe8f883c565e1cb1a51201ee5ddbc03d49c Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:07:45 +0000
Subject: [PATCH 294/323] update

---
 torch_xla/experimental/fori_loop.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 3aac50599a0..7140bafb039 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -106,7 +106,10 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   body_ctx.set_name_string("bodyctx")
 
   # !!! body xlacomputation change !!! add non-changed output_value argument
-  additional_inputs_list_body = [fake_carried_inputs[-3]]
+  if additional_inputs:
+    additional_inputs_list_body = [fake_carried_inputs[-3]]
+  else:
+    additional_inputs_list_body = []
 
   ### TODO(@manfei): treat hard-code parameters: additional_inputs_list_body
   body_ctx.buildforiloop(list(body_result), additional_inputs_list_body)

From 12371764f300045e96241e112b4a8b02c2df5bb0 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:08:49 +0000
Subject: [PATCH 295/323] update

---
 torch_xla/experimental/fori_loop.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 7140bafb039..4c2920bbf1e 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -141,9 +141,10 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
 
   ### TODO(@manfei): treat hard-code input arguments
   # !!! init change !!! switch bias and output_value
-  tmp_bias = params[-3]
-  del params[-3]
-  params.append(tmp_bias)
+  if additional_inputs:
+    tmp_bias = params[-3]
+    del params[-3]
+    params.append(tmp_bias)
 
   ### generate while xlacomputation
   input_tuple = xb.Op.tuple(tuple(params))

From 620d6270e71782833a84987cd9ba693739a2775a Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:09:51 +0000
Subject: [PATCH 296/323] update

---
 torch_xla/experimental/fori_loop.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 4c2920bbf1e..f5af1b8c967 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -85,7 +85,7 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   # del additional_inputs_list_cond[-2] ### not used, change order doesn't affect logic
   # additional_inputs_list_cond.append(tmp_bias) ### not used, change order doesn't affect logic
 
-  # !!! cond xlacomputation change !!! switch output_value and weight position
+  # !!! cond xlacomputation change !!! switch output_value and weight position if additional_inputs(weight/bias) exists
   additional_inputs_list_cond = list(fake_carried_inputs[2:]) ### all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
   if additional_inputs:
     tmp_bias = additional_inputs_list_cond[-3] ### not used, change order doesn't affect logic
@@ -105,7 +105,7 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
   body_ctx.set_name_string("bodyctx")
 
-  # !!! body xlacomputation change !!! add non-changed output_value argument
+  # !!! body xlacomputation change !!! add non-changed output_value argument if additional_inputs(weight/bias) exists
   if additional_inputs:
     additional_inputs_list_body = [fake_carried_inputs[-3]]
   else:
@@ -140,7 +140,7 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   # params.append(tmp_bias)
 
   ### TODO(@manfei): treat hard-code input arguments
-  # !!! init change !!! switch bias and output_value
+  # !!! init change !!! switch bias and output_value if additional_inputs(weight/bias) exists
   if additional_inputs:
     tmp_bias = params[-3]
     del params[-3]

From cfa5f7e932a364bfb52eac25ec46e68b7e5019c2 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:10:06 +0000
Subject: [PATCH 297/323] update

---
 torch_xla/experimental/fori_loop.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index f5af1b8c967..ed6bf8438c5 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -79,12 +79,6 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
 
-  # # !!! cond xlacomputation change !!! switch bias and weight position
-  # additional_inputs_list_cond = list(fake_carried_inputs[2:]) ### all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
-  # tmp_bias = additional_inputs_list_cond[-2] ### not used, change order doesn't affect logic
-  # del additional_inputs_list_cond[-2] ### not used, change order doesn't affect logic
-  # additional_inputs_list_cond.append(tmp_bias) ### not used, change order doesn't affect logic
-
   # !!! cond xlacomputation change !!! switch output_value and weight position if additional_inputs(weight/bias) exists
   additional_inputs_list_cond = list(fake_carried_inputs[2:]) ### all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
   if additional_inputs:

From 0cb1dce13221b29b5da139facfb8712a7b817ddc Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:13:41 +0000
Subject: [PATCH 298/323] update

---
 ...ori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 6 ++++--
 torch_xla/experimental/fori_loop.py                         | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 322829a1e61..84dfe3e11d8 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -32,6 +32,7 @@ def _fake_fori_loop(lower, upper, body_fun, *init_val):
 
 class WhileLoopTest(unittest.TestCase):
 
+# passed
   def test_while_loop_tpu_subtraction(self):
 
     device = xm.xla_device()
@@ -50,6 +51,7 @@ def body_fn(init, limit_value):
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
 
+# passed
   def test_while_loop_tpu_addition(self):
 
     device = xm.xla_device()
@@ -68,6 +70,7 @@ def body_fn(init, limit_value):
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
 
+# passed
   def test_while_loop_tpu_subtraction_nested(self):
 
     device = xm.xla_device()
@@ -193,8 +196,7 @@ def test_fori_loop_tpu_addition(self):
     def body_fun(a, b):
       return torch.add(a, b)
 
-    lower_, upper_, res_ = fori_loop(upper, lower, body_fun, one_value,
-                                     init_val)
+    lower_, upper_, res_ = fori_loop(upper, lower, body_fun, one_value, init_val)
     expected = _fake_fori_loop(lower, upper, body_fun, init_val, one_value)
     self.assertEqual(expected, res_)
 
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index ed6bf8438c5..e99953c9ec4 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -26,7 +26,7 @@ def cond_fn(upper, lower, one_value, x, input_value, output_value):
       return lower[0] < upper[0]
     def body_fn(upper, lower, one_value, x, input_value, output_value):
       new_lower = torch.add(one_value, lower)
-      output_value = body_fun(input_value)
+      output_value = body_fun(input_value, one_value)
       weight = body_fun.weight  ### not be used actually, initialized as placeholder xlacomputation requirement
       bias = body_fun.bias  ### not be used actually, initialized as placeholder xlacomputation requirement
       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() 

From 2dcfab0790dd95b103a2805e690d1fd3c71dc473 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:14:21 +0000
Subject: [PATCH 299/323] update

---
 torch_xla/experimental/fori_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index e99953c9ec4..da166177d01 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -26,7 +26,7 @@ def cond_fn(upper, lower, one_value, x, input_value, output_value):
       return lower[0] < upper[0]
     def body_fn(upper, lower, one_value, x, input_value, output_value):
       new_lower = torch.add(one_value, lower)
-      output_value = body_fun(input_value, one_value)
+      output_value = body_fun(input_value)
       weight = body_fun.weight  ### not be used actually, initialized as placeholder xlacomputation requirement
       bias = body_fun.bias  ### not be used actually, initialized as placeholder xlacomputation requirement
       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() 
@@ -36,7 +36,7 @@ def cond_fn(upper, lower, one_value, x, input_value, output_value):
       return lower[0] < upper[0]
     def body_fn(upper, lower, one_value, x, input_value, output_value):
       new_lower = torch.add(one_value, lower)
-      output_value = body_fun(input_value)
+      output_value = body_fun(input_value, one_value)
       return upper, new_lower, one_value, torch.add(one_value, x), input_value, output_value
     res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
 

From d0270b6cf80720b099f5853fc2e4facace86baff Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:14:57 +0000
Subject: [PATCH 300/323] update

---
 torch_xla/experimental/fori_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index da166177d01..5c66d48010e 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -36,7 +36,7 @@ def cond_fn(upper, lower, one_value, x, input_value, output_value):
       return lower[0] < upper[0]
     def body_fn(upper, lower, one_value, x, input_value, output_value):
       new_lower = torch.add(one_value, lower)
-      output_value = body_fun(input_value, one_value)
+      output_value = body_fun(one_value, input_value)
       return upper, new_lower, one_value, torch.add(one_value, x), input_value, output_value
     res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
 

From 2377b8b12e85d67c1a23759dd0801fa4093843fc Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:23:16 +0000
Subject: [PATCH 301/323] update

---
 torch_xla/experimental/fori_loop.py | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 5c66d48010e..4c25db6515c 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -12,6 +12,32 @@
 from torch._higher_order_ops.while_loop import while_loop_op
 from torch._higher_order_ops.while_loop import while_loop as torch_while_loop
 
+/////////
+  def test_while_loop_tpu_addition(self):
+    device = xm.xla_device()
+    def cond_fn(init, limit_value):
+      return limit_value[0] >= init[0]
+    def body_fn(init, limit_value):
+      one_value = torch.ones(1, dtype=torch.int32, device=device)
+      return (torch.add(init, one_value), limit_value.clone())
+    # TODO(@manfei): init and limit_value has to be torch.tensor.
+    init = torch.tensor([0], dtype=torch.int32, device=device)
+    limit_value = torch.tensor([10], dtype=torch.int32, device=device)
+    res = while_loop(cond_fn, body_fn, (init, limit_value))
+/////////
+def fori_loop(lower, upper, user_body_func, *init_val):
+  device = xm.xla_device()
+  def cond_fn(upper, lower, *init_val):
+    return lower[0] < upper[0]
+  def body_fn(upper, lower, *init_val):
+    one_value_i = torch.ones(1, dtype=torch.int32, device=device)
+    res_list = list(user_body_func(*init_val))
+    res_list.insert(0, lower)
+    res_list.insert(0, torch.sub(upper, one_value_i))
+    return res_list
+  res = while_loop(cond_fn, body_fn, (lower, upper, *init_val))
+  return res
+/////////
 
 ### TODO(@manfei): treat *input_value
 def fori_loop(upper, lower, body_fun, init_val, input_value):
@@ -37,7 +63,7 @@ def cond_fn(upper, lower, one_value, x, input_value, output_value):
     def body_fn(upper, lower, one_value, x, input_value, output_value):
       new_lower = torch.add(one_value, lower)
       output_value = body_fun(one_value, input_value)
-      return upper, new_lower, one_value, torch.add(one_value, x), input_value, output_value
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value.clone()
     res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
 
   return res

From 8947db7e6ec4b0233a0532f1c7b2637107400b8c Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:23:40 +0000
Subject: [PATCH 302/323] update

---
 torch_xla/experimental/fori_loop.py | 52 ++++++++++++++---------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 4c25db6515c..eef46f7f0e6 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -12,32 +12,32 @@
 from torch._higher_order_ops.while_loop import while_loop_op
 from torch._higher_order_ops.while_loop import while_loop as torch_while_loop
 
-/////////
-  def test_while_loop_tpu_addition(self):
-    device = xm.xla_device()
-    def cond_fn(init, limit_value):
-      return limit_value[0] >= init[0]
-    def body_fn(init, limit_value):
-      one_value = torch.ones(1, dtype=torch.int32, device=device)
-      return (torch.add(init, one_value), limit_value.clone())
-    # TODO(@manfei): init and limit_value has to be torch.tensor.
-    init = torch.tensor([0], dtype=torch.int32, device=device)
-    limit_value = torch.tensor([10], dtype=torch.int32, device=device)
-    res = while_loop(cond_fn, body_fn, (init, limit_value))
-/////////
-def fori_loop(lower, upper, user_body_func, *init_val):
-  device = xm.xla_device()
-  def cond_fn(upper, lower, *init_val):
-    return lower[0] < upper[0]
-  def body_fn(upper, lower, *init_val):
-    one_value_i = torch.ones(1, dtype=torch.int32, device=device)
-    res_list = list(user_body_func(*init_val))
-    res_list.insert(0, lower)
-    res_list.insert(0, torch.sub(upper, one_value_i))
-    return res_list
-  res = while_loop(cond_fn, body_fn, (lower, upper, *init_val))
-  return res
-/////////
+# /////////
+#   def test_while_loop_tpu_addition(self):
+#     device = xm.xla_device()
+#     def cond_fn(init, limit_value):
+#       return limit_value[0] >= init[0]
+#     def body_fn(init, limit_value):
+#       one_value = torch.ones(1, dtype=torch.int32, device=device)
+#       return (torch.add(init, one_value), limit_value.clone())
+#     # TODO(@manfei): init and limit_value has to be torch.tensor.
+#     init = torch.tensor([0], dtype=torch.int32, device=device)
+#     limit_value = torch.tensor([10], dtype=torch.int32, device=device)
+#     res = while_loop(cond_fn, body_fn, (init, limit_value))
+# /////////
+# def fori_loop(lower, upper, user_body_func, *init_val):
+#   device = xm.xla_device()
+#   def cond_fn(upper, lower, *init_val):
+#     return lower[0] < upper[0]
+#   def body_fn(upper, lower, *init_val):
+#     one_value_i = torch.ones(1, dtype=torch.int32, device=device)
+#     res_list = list(user_body_func(*init_val))
+#     res_list.insert(0, lower)
+#     res_list.insert(0, torch.sub(upper, one_value_i))
+#     return res_list
+#   res = while_loop(cond_fn, body_fn, (lower, upper, *init_val))
+#   return res
+# /////////
 
 ### TODO(@manfei): treat *input_value
 def fori_loop(upper, lower, body_fun, init_val, input_value):

From 0fbb23d0049d2f61ee9e9b728ee552d8a9f98200 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:27:25 +0000
Subject: [PATCH 303/323] update

---
 torch_xla/experimental/fori_loop.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index eef46f7f0e6..a492e56a462 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -58,13 +58,13 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() 
     res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
   else:
-    def cond_fn(upper, lower, one_value, x, input_value, output_value):
+    def cond_fn(upper, lower, one_value, x, input_value):
       return lower[0] < upper[0]
-    def body_fn(upper, lower, one_value, x, input_value, output_value):
+    def body_fn(upper, lower, one_value, x, input_value):
       new_lower = torch.add(one_value, lower)
-      output_value = body_fun(one_value, input_value)
-      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value.clone()
-    res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
+      output_val = body_fun(one_value, input_value)
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), output_val.clone()
+    res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value))
 
   return res
 

From 53f818564c0a46ee18d3395b0c3dde0402318f38 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:29:29 +0000
Subject: [PATCH 304/323] update

---
 ...i_loop_with_while_loop_simple_add_dispatch_in_torch.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 84dfe3e11d8..ad14be19f40 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -196,7 +196,13 @@ def test_fori_loop_tpu_addition(self):
     def body_fun(a, b):
       return torch.add(a, b)
 
-    lower_, upper_, res_ = fori_loop(upper, lower, body_fun, one_value, init_val)
+    # lower_, upper_, res_ = fori_loop(upper, lower, body_fun, one_value, init_val)
+    upper_, new_lower_, one_value_, add_res_x_, res_ = fori_loop(upper, lower, body_fun, one_value, init_val)
+    print("upper_: ", upper_)
+    print("new_lower_: ", new_lower_)
+    print("one_value_: ", one_value_)
+    print("add_res_x_: ", add_res_x_)
+    print("res_: ", res_)
     expected = _fake_fori_loop(lower, upper, body_fun, init_val, one_value)
     self.assertEqual(expected, res_)
 

From 52435ecbf6152d10471b06d0189f3b79528b98db Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:33:18 +0000
Subject: [PATCH 305/323] update

---
 torch_xla/experimental/fori_loop.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index a492e56a462..0696977b069 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -44,10 +44,11 @@ def fori_loop(upper, lower, body_fun, init_val, input_value):
 
   device = xm.xla_device()
 
-  output_value = torch.zeros([20], dtype=torch.float32, device=device)
+  # output_value = torch.zeros([20], dtype=torch.float32, device=device)
   one_value = torch.tensor([1], dtype=torch.int32, device=device)
 
   if (hasattr(body_fun, 'weight') or hasattr(body_fun, 'bias')):
+    output_value = torch.zeros([20], dtype=torch.float32, device=device)
     def cond_fn(upper, lower, one_value, x, input_value, output_value):
       return lower[0] < upper[0]
     def body_fn(upper, lower, one_value, x, input_value, output_value):
@@ -58,13 +59,15 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() 
     res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
   else:
-    def cond_fn(upper, lower, one_value, x, input_value):
+    # output_value = torch.zeros([1], dtype=torch.float32, device=device)
+    output_value = torch.tensor([1], dtype=torch.int32, device=device)
+    def cond_fn(upper, lower, one_value, x, input_value, output_value):
       return lower[0] < upper[0]
-    def body_fn(upper, lower, one_value, x, input_value):
+    def body_fn(upper, lower, one_value, x, input_value, output_value):
       new_lower = torch.add(one_value, lower)
       output_val = body_fun(one_value, input_value)
-      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), output_val.clone()
-    res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value))
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_val.clone()
+    res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
 
   return res
 

From 116a68bc9b19ac61ae60e92af25a08fea41a0256 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:39:13 +0000
Subject: [PATCH 306/323] update

---
 torch_xla/experimental/fori_loop.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 0696977b069..950e26d4e3e 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -132,7 +132,8 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   if additional_inputs:
     additional_inputs_list_body = [fake_carried_inputs[-3]]
   else:
-    additional_inputs_list_body = []
+    # add fake output_value to do map and not reuse output in the next turn
+    additional_inputs_list_body = [fake_carried_inputs[-1]]
 
   ### TODO(@manfei): treat hard-code parameters: additional_inputs_list_body
   body_ctx.buildforiloop(list(body_result), additional_inputs_list_body)

From 3cb631a21bd1d952d9a3d089dac73882de486714 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:40:53 +0000
Subject: [PATCH 307/323] update

---
 ..._fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 4 ++++
 torch_xla/experimental/fori_loop.py                           | 3 +--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index ad14be19f40..9a49d6b5c2b 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -49,6 +49,8 @@ def body_fn(init, limit_value):
     limit_value = torch.tensor([0], dtype=torch.int32, device=device)
     res = while_loop(cond_fn, body_fn, (init, limit_value))
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
+    print("expected: ", expected)
+    print("res: ", res)
     self.assertEqual(expected, res)
 
 # passed
@@ -69,6 +71,8 @@ def body_fn(init, limit_value):
     res = while_loop(cond_fn, body_fn, (init, limit_value))
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
+    print("expected: ", expected)
+    print("res: ", res)
 
 # passed
   def test_while_loop_tpu_subtraction_nested(self):
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 950e26d4e3e..0696977b069 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -132,8 +132,7 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   if additional_inputs:
     additional_inputs_list_body = [fake_carried_inputs[-3]]
   else:
-    # add fake output_value to do map and not reuse output in the next turn
-    additional_inputs_list_body = [fake_carried_inputs[-1]]
+    additional_inputs_list_body = []
 
   ### TODO(@manfei): treat hard-code parameters: additional_inputs_list_body
   body_ctx.buildforiloop(list(body_result), additional_inputs_list_body)

From fe3a5302d314630af0dfcb3d01f1338acc50238f Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:42:22 +0000
Subject: [PATCH 308/323] update

---
 torch_xla/experimental/fori_loop.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 0696977b069..32513a59936 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -61,13 +61,13 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
   else:
     # output_value = torch.zeros([1], dtype=torch.float32, device=device)
     output_value = torch.tensor([1], dtype=torch.int32, device=device)
-    def cond_fn(upper, lower, one_value, x, input_value, output_value):
+    def cond_fn(upper, lower, one_value, x, input_value): # , output_value):
       return lower[0] < upper[0]
-    def body_fn(upper, lower, one_value, x, input_value, output_value):
+    def body_fn(upper, lower, one_value, x, input_value): # , output_value):
       new_lower = torch.add(one_value, lower)
       output_val = body_fun(one_value, input_value)
-      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_val.clone()
-    res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), output_val.clone()
+    res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value))
 
   return res
 

From ac574f35f9be4630e532f8d8e8961612c077de7e Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:44:08 +0000
Subject: [PATCH 309/323] update

---
 ...est_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 9a49d6b5c2b..6d7009b04ce 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -208,6 +208,7 @@ def body_fun(a, b):
     print("add_res_x_: ", add_res_x_)
     print("res_: ", res_)
     expected = _fake_fori_loop(lower, upper, body_fun, init_val, one_value)
+    print("expected: ", expected)
     self.assertEqual(expected, res_)
 
 # passed

From ca3e7576e3885c103cb8d40cac737df13bc4450f Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:46:51 +0000
Subject: [PATCH 310/323] update

---
 ...t_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 6d7009b04ce..1cacbdee9f6 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -23,7 +23,8 @@ def _fake_fori_loop(lower, upper, body_fun, *init_val):
   if len(init_val) > 1:
     (a, b) = init_val
     for i in range((upper - lower)[0]):
-      a = body_fun(*init_val)
+      # a = body_fun(*init_val)
+      a = body_fun(a, b)
   else:
     for i in range((upper - lower)[0]):
       a = body_fun(*init_val)

From 18194202a1444db45602395fa544eb51aef54f47 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:49:25 +0000
Subject: [PATCH 311/323] update

---
 ...while_loop_simple_add_dispatch_in_torch.py | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 1cacbdee9f6..29ae7e108c0 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -50,8 +50,8 @@ def body_fn(init, limit_value):
     limit_value = torch.tensor([0], dtype=torch.int32, device=device)
     res = while_loop(cond_fn, body_fn, (init, limit_value))
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
-    print("expected: ", expected)
-    print("res: ", res)
+    # print("expected: ", expected)
+    # print("res: ", res)
     self.assertEqual(expected, res)
 
 # passed
@@ -72,8 +72,8 @@ def body_fn(init, limit_value):
     res = while_loop(cond_fn, body_fn, (init, limit_value))
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
-    print("expected: ", expected)
-    print("res: ", res)
+    # print("expected: ", expected)
+    # print("res: ", res)
 
 # passed
   def test_while_loop_tpu_subtraction_nested(self):
@@ -188,6 +188,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     self.assertTrue(torch.all(torch.eq(expected, output_value_real__)))
     return aaa
 
+# passed
   def test_fori_loop_tpu_addition(self):
 
     xm.mark_step()
@@ -203,13 +204,13 @@ def body_fun(a, b):
 
     # lower_, upper_, res_ = fori_loop(upper, lower, body_fun, one_value, init_val)
     upper_, new_lower_, one_value_, add_res_x_, res_ = fori_loop(upper, lower, body_fun, one_value, init_val)
-    print("upper_: ", upper_)
-    print("new_lower_: ", new_lower_)
-    print("one_value_: ", one_value_)
-    print("add_res_x_: ", add_res_x_)
-    print("res_: ", res_)
+    # print("upper_: ", upper_)
+    # print("new_lower_: ", new_lower_)
+    # print("one_value_: ", one_value_)
+    # print("add_res_x_: ", add_res_x_)
+    # print("res_: ", res_)
     expected = _fake_fori_loop(lower, upper, body_fun, init_val, one_value)
-    print("expected: ", expected)
+    # print("expected: ", expected)
     self.assertEqual(expected, res_)
 
 # passed

From 1b91fc82a12eb429e35f2844c35967ff87491910 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:49:51 +0000
Subject: [PATCH 312/323] update

---
 torch_xla/experimental/fori_loop.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 32513a59936..863ba0c6d1f 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -78,8 +78,8 @@ def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   ### cond_fn&body_fn: callable
   ### carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)
   print("arrive here @while_loop_op.py_impl(DispatchKey.XLA) !!!")
-  print("carried_inputs: ", carried_inputs)
-  print("additional_inputs: ", additional_inputs)
+  # print("carried_inputs: ", carried_inputs)
+  # print("additional_inputs: ", additional_inputs)
   if additional_inputs is None:
     additional_inputs = tuple()
   return _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs)
@@ -101,7 +101,7 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
     fake_carried_inputs.append(
         torch.randint(10, additional_input.size(),
                       dtype=additional_input.dtype).to(device))
-  print("fake_carried_inputs: ", fake_carried_inputs)
+  # print("fake_carried_inputs: ", fake_carried_inputs)
 
   ### TODO(@manfei): specify which element is for which argument like a,b,c
   cond_result = cond_fn(*fake_carried_inputs)

From 673145d105d15c965bf72bbe8bc160c7df6307ab Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 21:50:44 +0000
Subject: [PATCH 313/323] update

---
 torch_xla/experimental/fori_loop.py | 28 +---------------------------
 1 file changed, 1 insertion(+), 27 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 863ba0c6d1f..1c4394d73bd 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -12,32 +12,6 @@
 from torch._higher_order_ops.while_loop import while_loop_op
 from torch._higher_order_ops.while_loop import while_loop as torch_while_loop
 
-# /////////
-#   def test_while_loop_tpu_addition(self):
-#     device = xm.xla_device()
-#     def cond_fn(init, limit_value):
-#       return limit_value[0] >= init[0]
-#     def body_fn(init, limit_value):
-#       one_value = torch.ones(1, dtype=torch.int32, device=device)
-#       return (torch.add(init, one_value), limit_value.clone())
-#     # TODO(@manfei): init and limit_value has to be torch.tensor.
-#     init = torch.tensor([0], dtype=torch.int32, device=device)
-#     limit_value = torch.tensor([10], dtype=torch.int32, device=device)
-#     res = while_loop(cond_fn, body_fn, (init, limit_value))
-# /////////
-# def fori_loop(lower, upper, user_body_func, *init_val):
-#   device = xm.xla_device()
-#   def cond_fn(upper, lower, *init_val):
-#     return lower[0] < upper[0]
-#   def body_fn(upper, lower, *init_val):
-#     one_value_i = torch.ones(1, dtype=torch.int32, device=device)
-#     res_list = list(user_body_func(*init_val))
-#     res_list.insert(0, lower)
-#     res_list.insert(0, torch.sub(upper, one_value_i))
-#     return res_list
-#   res = while_loop(cond_fn, body_fn, (lower, upper, *init_val))
-#   return res
-# /////////
 
 ### TODO(@manfei): treat *input_value
 def fori_loop(upper, lower, body_fun, init_val, input_value):
@@ -77,7 +51,7 @@ def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   ### TODO(@manfei): PyTorch require carried_inputs to be list/tuple, PyTorch/XLA _xla_while_loop only accept *operands, *operands would tuple items again: (a, '')
   ### cond_fn&body_fn: callable
   ### carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)
-  print("arrive here @while_loop_op.py_impl(DispatchKey.XLA) !!!")
+  # print("arrive here @while_loop_op.py_impl(DispatchKey.XLA) !!!")
   # print("carried_inputs: ", carried_inputs)
   # print("additional_inputs: ", additional_inputs)
   if additional_inputs is None:

From 12f6f71716785bc2fd7940aa947f28839d074a25 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 22:00:51 +0000
Subject: [PATCH 314/323] update

---
 torch_xla/experimental/fori_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 1c4394d73bd..80bc209aacf 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -60,7 +60,7 @@ def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
 
 
 def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
-  print("arrive here _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None): !!!")
+  # print("arrive here _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None): !!!")
   ### fake carried_inputs to split formal code
   fake_carried_inputs = []
   for carried_input in carried_inputs:

From 89605cbb6b91336c3012bc715b0ae870eaa26f2e Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 22:38:43 +0000
Subject: [PATCH 315/323] format

---
 ...while_loop_simple_add_dispatch_in_torch.py | 37 +------------------
 torch_xla/experimental/fori_loop.py           | 36 +++---------------
 2 files changed, 7 insertions(+), 66 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 29ae7e108c0..73d55d3dc47 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -23,7 +23,6 @@ def _fake_fori_loop(lower, upper, body_fun, *init_val):
   if len(init_val) > 1:
     (a, b) = init_val
     for i in range((upper - lower)[0]):
-      # a = body_fun(*init_val)
       a = body_fun(a, b)
   else:
     for i in range((upper - lower)[0]):
@@ -33,7 +32,6 @@ def _fake_fori_loop(lower, upper, body_fun, *init_val):
 
 class WhileLoopTest(unittest.TestCase):
 
-# passed
   def test_while_loop_tpu_subtraction(self):
 
     device = xm.xla_device()
@@ -50,11 +48,8 @@ def body_fn(init, limit_value):
     limit_value = torch.tensor([0], dtype=torch.int32, device=device)
     res = while_loop(cond_fn, body_fn, (init, limit_value))
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
-    # print("expected: ", expected)
-    # print("res: ", res)
     self.assertEqual(expected, res)
 
-# passed
   def test_while_loop_tpu_addition(self):
 
     device = xm.xla_device()
@@ -72,10 +67,7 @@ def body_fn(init, limit_value):
     res = while_loop(cond_fn, body_fn, (init, limit_value))
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
-    # print("expected: ", expected)
-    # print("res: ", res)
 
-# passed
   def test_while_loop_tpu_subtraction_nested(self):
 
     device = xm.xla_device()
@@ -94,54 +86,37 @@ def body_fn(init, limit_value):
     expected = _fake_while_loop(cond_fn, body_fn, (init, limit_value))
     self.assertEqual(expected, res)
 
-# passed
   def test_while_loop_tpu_simple_linear(self):
 
     xm.mark_step()
     device = xm.xla_device()
     torch.set_grad_enabled(False)
 
-    # def forward(self, upper, lower, one_value, x, input_value, output_value):
     linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
-    # weight_0 = linear_0.weight
-    # bias_0 = linear_0.bias
-    # print("original bias: ", bias_0)
 
     def cond_fn(upper, lower, one_value, x, input_value, output_value):
       return lower[0] < upper[0]
 
     def body_fn(upper, lower, one_value, x, input_value, output_value):
       new_lower = torch.add(one_value, lower)
-      # output_value_real = linear_0(input_value)
       output_value = linear_0(input_value)
       weight = linear_0.weight # not be used actually, initialized as placeholder xlacomputation requirement 
       bias = linear_0.bias # not be used actually, initialized as placeholder xlacomputation requirement 
-      # return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real.clone(), weight.clone(), bias.clone() # , output_value_real
       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() # , output_value_real
 
     upper = torch.tensor([1], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)
     one_value = torch.tensor([1], dtype=torch.int32, device=device)
     init_val = torch.tensor([1], dtype=torch.int32, device=device)
-    # l_in_0 = torch.ones(10, device=xm.xla_device()) # input_value
-    l_in_0 = torch.rand(10, device=xm.xla_device()) # input_value
+    l_in_0 = torch.rand(10, device=xm.xla_device())
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
-    # weight_0 = linear_0.weight
-    # bias_0 = linear_0.bias
 
     upper__, lower__, one_value__, torch_add_res__, input_value__, bias__, weight__, output_value_real__, = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 
-    # print("torch_add_res__: ", torch_add_res__)
-    # print("output_value_real__: ", output_value_real__)
-    # print("bias__: ", bias__)
-    # print("expected: ", expected)
-
-    # self.assertTrue(torch.all(torch.eq(expected, output_value_real__)))
     return self.assertTrue(torch.all(torch.eq(expected, output_value_real__)))
 
-# passed
   def test_while_loop_tpu_simple_linear_class(self):
 
     xm.mark_step()
@@ -177,6 +152,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     bias_0 = simple_with_linear.linear.bias
 
     aaa = {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
+
     upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
 
     # same weight/bias liear model
@@ -188,7 +164,6 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     self.assertTrue(torch.all(torch.eq(expected, output_value_real__)))
     return aaa
 
-# passed
   def test_fori_loop_tpu_addition(self):
 
     xm.mark_step()
@@ -202,18 +177,10 @@ def test_fori_loop_tpu_addition(self):
     def body_fun(a, b):
       return torch.add(a, b)
 
-    # lower_, upper_, res_ = fori_loop(upper, lower, body_fun, one_value, init_val)
     upper_, new_lower_, one_value_, add_res_x_, res_ = fori_loop(upper, lower, body_fun, one_value, init_val)
-    # print("upper_: ", upper_)
-    # print("new_lower_: ", new_lower_)
-    # print("one_value_: ", one_value_)
-    # print("add_res_x_: ", add_res_x_)
-    # print("res_: ", res_)
     expected = _fake_fori_loop(lower, upper, body_fun, init_val, one_value)
-    # print("expected: ", expected)
     self.assertEqual(expected, res_)
 
-# passed
   def test_fori_loop_tpu_simple_linear(self):
 
     xm.mark_step()
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index 80bc209aacf..da1ebb4a0c9 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -18,7 +18,6 @@ def fori_loop(upper, lower, body_fun, init_val, input_value):
 
   device = xm.xla_device()
 
-  # output_value = torch.zeros([20], dtype=torch.float32, device=device)
   one_value = torch.tensor([1], dtype=torch.int32, device=device)
 
   if (hasattr(body_fun, 'weight') or hasattr(body_fun, 'bias')):
@@ -33,11 +32,10 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() 
     res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
   else:
-    # output_value = torch.zeros([1], dtype=torch.float32, device=device)
     output_value = torch.tensor([1], dtype=torch.int32, device=device)
-    def cond_fn(upper, lower, one_value, x, input_value): # , output_value):
+    def cond_fn(upper, lower, one_value, x, input_value):
       return lower[0] < upper[0]
-    def body_fn(upper, lower, one_value, x, input_value): # , output_value):
+    def body_fn(upper, lower, one_value, x, input_value):
       new_lower = torch.add(one_value, lower)
       output_val = body_fun(one_value, input_value)
       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), output_val.clone()
@@ -51,38 +49,31 @@ def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   ### TODO(@manfei): PyTorch require carried_inputs to be list/tuple, PyTorch/XLA _xla_while_loop only accept *operands, *operands would tuple items again: (a, '')
   ### cond_fn&body_fn: callable
   ### carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)
-  # print("arrive here @while_loop_op.py_impl(DispatchKey.XLA) !!!")
-  # print("carried_inputs: ", carried_inputs)
-  # print("additional_inputs: ", additional_inputs)
   if additional_inputs is None:
     additional_inputs = tuple()
   return _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs)
 
 
 def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
-  # print("arrive here _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None): !!!")
   ### fake carried_inputs to split formal code
   fake_carried_inputs = []
   for carried_input in carried_inputs:
     device = carried_input.device
-    ###TODO(@manfei) type = carried_input.type
     fake_carried_inputs.append(
         torch.randint(10, carried_input.size(),
                       dtype=carried_input.dtype).to(device))
   for additional_input in additional_inputs:
     device = additional_input.device
-    ###TODO(@manfei) type = carried_input.type
     fake_carried_inputs.append(
         torch.randint(10, additional_input.size(),
                       dtype=additional_input.dtype).to(device))
-  # print("fake_carried_inputs: ", fake_carried_inputs)
 
   ### TODO(@manfei): specify which element is for which argument like a,b,c
   cond_result = cond_fn(*fake_carried_inputs)
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
 
-  # !!! cond xlacomputation change !!! switch output_value and weight position if additional_inputs(weight/bias) exists
+  ### TODO(@manfei): treat hard-code cond xlacomputation change: currently switch output_value and weight position if additional_inputs(weight/bias) exists
   additional_inputs_list_cond = list(fake_carried_inputs[2:]) ### all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
   if additional_inputs:
     tmp_bias = additional_inputs_list_cond[-3] ### not used, change order doesn't affect logic
@@ -93,16 +84,13 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   cond_hlo = cond_ctx.hlo()
   cond_computation = xb.computation_from_module_proto("condcomputation",
                                                       cond_hlo)
-  # cond_hlo_print = xb.get_computation_hlo(cond_computation)
-  # print("cond computation: !!!!!!!!!")
-  # print(cond_hlo_print)
 
   ### generate body_fn xlacomputation
   body_result = body_fn(*fake_carried_inputs)
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
   body_ctx.set_name_string("bodyctx")
 
-  # !!! body xlacomputation change !!! add non-changed output_value argument if additional_inputs(weight/bias) exists
+  ### TODO(@manfei): treat hard-code body xlacomputation change: currently add non-changed output_value argument if additional_inputs(weight/bias) exists
   if additional_inputs:
     additional_inputs_list_body = [fake_carried_inputs[-3]]
   else:
@@ -113,9 +101,6 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",
                                                       body_hlo)
-  # body_hlo_print = xb.get_computation_hlo(body_computation)
-  # print("body computation: !!!!!!!!!")
-  # print(body_hlo_print)
 
   ### trans fake_carried_inputs from list(tensor) to list(xla::op), which part could change init of xla::while
   total_inputs = carried_inputs + additional_inputs
@@ -130,14 +115,7 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
     p = xb.mkparam(builder, len(params), shape)
     params.append(p)
 
-  # ### TODO(@manfei): treat hard-code input arguments
-  # # !!! init change !!! 
-  # tmp_bias = params[-2]
-  # del params[-2]
-  # params.append(tmp_bias)
-
-  ### TODO(@manfei): treat hard-code input arguments
-  # !!! init change !!! switch bias and output_value if additional_inputs(weight/bias) exists
+  ### TODO(@manfei): treat hard-code input arguments, currently switch bias and output_value if additional_inputs(weight/bias) exists
   if additional_inputs:
     tmp_bias = params[-3]
     del params[-3]
@@ -151,14 +129,10 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
       body_computation=body_computation)
   name = 'fori_loop_ed_torch_func'
   computation = w.build(name)
-  # hlo_print = xb.get_computation_hlo(computation)
-  # print("while computation: !!!!!!!!!")
-  # print(hlo_print)
 
   ### gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',
                                                  (total_inputs),
                                                  computation)
-  ### print("!!! arrive here too after while!!!")
 
   return result
\ No newline at end of file

From 2e9c979997e146b51a420ecc61d6a1b2ce93d4b2 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 22:44:27 +0000
Subject: [PATCH 316/323] format

---
 torch_xla/csrc/init_python_bindings.cpp | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index c7013569dd1..f48dcf9eb68 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -914,18 +914,10 @@ class PyLoweringContext {
   // needed in xlacomputation.
   void BuildForiLoop(std::vector<at::Tensor> tensors,
                      std::vector<at::Tensor> additional_inputs_list = {}) {
+    // hard-code modify cond xlacomputation input arguments with unusedarguments for xla::while requriement
     if (GetNameString() == "condctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
-      // hard-code parameter_idx to 2 to skip existing upper/lower arguments
-      // TODO(@manfei): get body xlacomputation arguments' number first then decide items in `additional_inputs_list`, maybe implement in python level
-      // !!! since cond_fn only compare upper and lower, so it would only use two arguments, due to PyTorch/XLA
-      // !!! trace xlacomputation from result tensor, so all the other arguments would not be included or generated;
-      // !!! but to meet xla::while requirement, we would skip first two arguments,
-      // !!! then add all other arguments like body_fn/init
-      // !!! --- additional_inputs_list: this list include all other arguments like body_fn/init except upper and lower
-      // !!! --- next step: we add dump paras according to additional_inputs_list
-      // ??? --- could we get IRvalue of `additional_inputs_list` in this function to complete xlacomputation?
-      int64_t parameter_idx = 2; // parameter_idx start from 2 after upper and lower
+      int64_t parameter_idx = 2; // parameter_idx start from 2 after used upper and lower
       for (auto& additional_input_tensor : additional_inputs_list) {
         XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
         xla::Shape shape = xtensor->shape().get();
@@ -935,11 +927,11 @@ class PyLoweringContext {
       }
     }
 
-    // hard-code modify body xlacomputation input arguments
-    // TODO(@manfei): get body xlacomputation arguments' number first then decide items in `additional_inputs_list`, maybe implement in python level
+    // hard-code modify body xlacomputation input arguments with unusedarguments for xla::while requriement
     if (GetNameString() == "bodyctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
-      int64_t parameter_idx = 7; // tensors.size();
+      // TODO(@manfei): treat hard code parameter_idx value
+      int64_t parameter_idx = 7;
       for (auto& additional_input_tensor : additional_inputs_list) {
         XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
         xla::Shape shape = xtensor->shape().get();

From 6db813948ce2e9bfb3c526f68d5100f7d7d61e06 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 22:45:32 +0000
Subject: [PATCH 317/323] format

---
 torch_xla/csrc/init_python_bindings.cpp | 16 +---------------
 torch_xla/csrc/lowering_context.cpp     |  2 --
 2 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index f48dcf9eb68..85545ae267e 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -966,20 +966,6 @@ class PyLoweringContext {
       std::vector<size_t> buffer_donor_indices;
       xla::ProgramShape program_shape =
           ConsumeValue(computation.GetProgramShape());
-      // // hard-code modify body xlacomputation input arguments
-      // // xxx: failed due to not change body_xlacomputation, might becase has been traced
-      // // xxx: after `computation = ConsumeValue(lowering_ctx.BuildXla());`
-      // if (GetNameString() == "bodyctx") {
-      //   xla::XlaBuilder* local_builder = lowering_ctx.builder();
-      //   int64_t parameter_idx = program_shape.parameters_size(); // tensors.size();
-      //   for (auto& additional_input_tensor : additional_inputs_list) {
-      //     XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
-      //     xla::Shape shape = xtensor->shape().get();
-      //     xla::XlaOp x = xla::Parameter(local_builder, parameter_idx, shape,
-      //                                 "UnusedArgumentsPlaceholder");
-      //     parameter_idx += 1;
-      //   }
-      // }
       // TODO(@manfei): please confirm whether we check for more than two or use
       // default value true
       bool should_wrap_parameter = (program_shape.parameters_size() >= 2);
@@ -2635,4 +2621,4 @@ void InitXlaBindings(py::module m) { InitXlaModuleBindings(m); }
 
 }  // namespace torch_xla
 
-PYBIND11_MODULE(_XLAC, m) { torch_xla::InitXlaBindings(m); }
\ No newline at end of file
+PYBIND11_MODULE(_XLAC, m) { torch_xla::InitXlaBindings(m); }
diff --git a/torch_xla/csrc/lowering_context.cpp b/torch_xla/csrc/lowering_context.cpp
index 39f82a4887b..a530995ca78 100644
--- a/torch_xla/csrc/lowering_context.cpp
+++ b/torch_xla/csrc/lowering_context.cpp
@@ -160,8 +160,6 @@ xla::StatusOr<xla::XlaComputation> LoweringContext::BuildXla() {
   if (!root_tuple_.empty() & (root_tuple_.size() == 1) &
       ((get_name_string() == "condctx") or (get_name_string() == "bodyctx"))) {
     xla = builder()->Build(root_tuple_.at(0));
-  // } else if (!root_tuple_.empty() & (root_tuple_.size() == 1) & ) {
-  //   xla = builder()->Build(root_tuple_.at(0));
   } else if (!root_tuple_.empty()) {
     xla::XlaOp root = xla::Tuple(builder(), root_tuple_);
     xla = builder()->Build(root);

From da3556144b3f2c93a0b1282da38aed00f4952473 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 22:47:15 +0000
Subject: [PATCH 318/323] format

---
 torch_xla/experimental/fori_loop.py | 40 ++++++++++++++---------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index da1ebb4a0c9..a36649fcc3e 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -13,7 +13,7 @@
 from torch._higher_order_ops.while_loop import while_loop as torch_while_loop
 
 
-### TODO(@manfei): treat *input_value
+# TODO(@manfei): treat *input_value
 def fori_loop(upper, lower, body_fun, init_val, input_value):
 
   device = xm.xla_device()
@@ -27,8 +27,8 @@ def cond_fn(upper, lower, one_value, x, input_value, output_value):
     def body_fn(upper, lower, one_value, x, input_value, output_value):
       new_lower = torch.add(one_value, lower)
       output_value = body_fun(input_value)
-      weight = body_fun.weight  ### not be used actually, initialized as placeholder xlacomputation requirement
-      bias = body_fun.bias  ### not be used actually, initialized as placeholder xlacomputation requirement
+      weight = body_fun.weight  # not be used actually, initialized as placeholder xlacomputation requirement
+      bias = body_fun.bias  # not be used actually, initialized as placeholder xlacomputation requirement
       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() 
     res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
   else:
@@ -46,16 +46,16 @@ def body_fn(upper, lower, one_value, x, input_value):
 
 @while_loop_op.py_impl(DispatchKey.XLA)
 def while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
-  ### TODO(@manfei): PyTorch require carried_inputs to be list/tuple, PyTorch/XLA _xla_while_loop only accept *operands, *operands would tuple items again: (a, '')
-  ### cond_fn&body_fn: callable
-  ### carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)
+  # TODO(@manfei): PyTorch require carried_inputs to be list/tuple, PyTorch/XLA _xla_while_loop only accept *operands, *operands would tuple items again: (a, '')
+  # cond_fn&body_fn: callable
+  # carried_inputs: (Tuple of possibly nested dict/list/tuple of tensors)
   if additional_inputs is None:
     additional_inputs = tuple()
   return _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs)
 
 
 def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
-  ### fake carried_inputs to split formal code
+  # fake carried_inputs to split formal code
   fake_carried_inputs = []
   for carried_input in carried_inputs:
     device = carried_input.device
@@ -68,41 +68,41 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
         torch.randint(10, additional_input.size(),
                       dtype=additional_input.dtype).to(device))
 
-  ### TODO(@manfei): specify which element is for which argument like a,b,c
+  # TODO(@manfei): specify which element is for which argument like a,b,c
   cond_result = cond_fn(*fake_carried_inputs)
   cond_ctx = torch_xla._XLAC.lowering.LoweringContext()
   cond_ctx.set_name_string("condctx")
 
-  ### TODO(@manfei): treat hard-code cond xlacomputation change: currently switch output_value and weight position if additional_inputs(weight/bias) exists
-  additional_inputs_list_cond = list(fake_carried_inputs[2:]) ### all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
+  # TODO(@manfei): treat hard-code cond xlacomputation change: currently switch output_value and weight position if additional_inputs(weight/bias) exists
+  additional_inputs_list_cond = list(fake_carried_inputs[2:]) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
   if additional_inputs:
-    tmp_bias = additional_inputs_list_cond[-3] ### not used, change order doesn't affect logic
-    del additional_inputs_list_cond[-3] ### not used, change order doesn't affect logic
-    additional_inputs_list_cond.append(tmp_bias) ### not used, change order doesn't affect logic
+    tmp_bias = additional_inputs_list_cond[-3] # not used, change order doesn't affect logic
+    del additional_inputs_list_cond[-3] # not used, change order doesn't affect logic
+    additional_inputs_list_cond.append(tmp_bias) # not used, change order doesn't affect logic
 
   cond_ctx.buildforiloop([cond_result], additional_inputs_list_cond)
   cond_hlo = cond_ctx.hlo()
   cond_computation = xb.computation_from_module_proto("condcomputation",
                                                       cond_hlo)
 
-  ### generate body_fn xlacomputation
+  # generate body_fn xlacomputation
   body_result = body_fn(*fake_carried_inputs)
   body_ctx = torch_xla._XLAC.lowering.LoweringContext()
   body_ctx.set_name_string("bodyctx")
 
-  ### TODO(@manfei): treat hard-code body xlacomputation change: currently add non-changed output_value argument if additional_inputs(weight/bias) exists
+  # TODO(@manfei): treat hard-code body xlacomputation change: currently add non-changed output_value argument if additional_inputs(weight/bias) exists
   if additional_inputs:
     additional_inputs_list_body = [fake_carried_inputs[-3]]
   else:
     additional_inputs_list_body = []
 
-  ### TODO(@manfei): treat hard-code parameters: additional_inputs_list_body
+  # TODO(@manfei): treat hard-code parameters: additional_inputs_list_body
   body_ctx.buildforiloop(list(body_result), additional_inputs_list_body)
   body_hlo = body_ctx.hlo()
   body_computation = xb.computation_from_module_proto("bodycomputation",
                                                       body_hlo)
 
-  ### trans fake_carried_inputs from list(tensor) to list(xla::op), which part could change init of xla::while
+  # trans fake_carried_inputs from list(tensor) to list(xla::op), which part could change init of xla::while
   total_inputs = carried_inputs + additional_inputs
   kwargs = {}
   if type(total_inputs) is tuple:
@@ -115,13 +115,13 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
     p = xb.mkparam(builder, len(params), shape)
     params.append(p)
 
-  ### TODO(@manfei): treat hard-code input arguments, currently switch bias and output_value if additional_inputs(weight/bias) exists
+  # TODO(@manfei): treat hard-code input arguments, currently switch bias and output_value if additional_inputs(weight/bias) exists
   if additional_inputs:
     tmp_bias = params[-3]
     del params[-3]
     params.append(tmp_bias)
 
-  ### generate while xlacomputation
+  # generate while xlacomputation
   input_tuple = xb.Op.tuple(tuple(params))
   w = xb.mkop(
       'While', (input_tuple.op,),
@@ -130,7 +130,7 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   name = 'fori_loop_ed_torch_func'
   computation = w.build(name)
 
-  ### gain final result with generated while xlacomputation
+  # gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',
                                                  (total_inputs),
                                                  computation)

From 431ab6627bf51f7a98baa7af6974207115278a8d Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 22:50:56 +0000
Subject: [PATCH 319/323] format

---
 torch_xla/csrc/init_python_bindings.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index 85545ae267e..f02fc059609 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -914,10 +914,12 @@ class PyLoweringContext {
   // needed in xlacomputation.
   void BuildForiLoop(std::vector<at::Tensor> tensors,
                      std::vector<at::Tensor> additional_inputs_list = {}) {
-    // hard-code modify cond xlacomputation input arguments with unusedarguments for xla::while requriement
+    // hard-code modify cond xlacomputation input arguments with unusedarguments
+    // for xla::while requriement
     if (GetNameString() == "condctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
-      int64_t parameter_idx = 2; // parameter_idx start from 2 after used upper and lower
+      int64_t parameter_idx =
+          2; // parameter_idx start from 2 after used upper and lower
       for (auto& additional_input_tensor : additional_inputs_list) {
         XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
         xla::Shape shape = xtensor->shape().get();
@@ -927,7 +929,8 @@ class PyLoweringContext {
       }
     }
 
-    // hard-code modify body xlacomputation input arguments with unusedarguments for xla::while requriement
+    // hard-code modify body xlacomputation input arguments with unusedarguments
+    // for xla::while requriement
     if (GetNameString() == "bodyctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
       // TODO(@manfei): treat hard code parameter_idx value

From 6244d21b590373e02c5cb054092c9fd5806434bc Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 22:52:24 +0000
Subject: [PATCH 320/323] format

---
 torch_xla/csrc/init_python_bindings.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index f02fc059609..e20e28fbb8f 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -919,7 +919,7 @@ class PyLoweringContext {
     if (GetNameString() == "condctx") {
       xla::XlaBuilder* local_builder = lowering_ctx.builder();
       int64_t parameter_idx =
-          2; // parameter_idx start from 2 after used upper and lower
+          2;  // parameter_idx start from 2 after used upper and lower
       for (auto& additional_input_tensor : additional_inputs_list) {
         XLATensorPtr xtensor = bridge::GetXlaTensor(additional_input_tensor);
         xla::Shape shape = xtensor->shape().get();

From 04ca72dceadd12e33a2abb216090efdee40a9c57 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 23:06:39 +0000
Subject: [PATCH 321/323] format

---
 ...while_loop_simple_add_dispatch_in_torch.py | 71 ++++++++++++-------
 torch_xla/experimental/fori_loop.py           | 41 +++++++----
 2 files changed, 74 insertions(+), 38 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 73d55d3dc47..3b2b018cada 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -100,9 +100,11 @@ def cond_fn(upper, lower, one_value, x, input_value, output_value):
     def body_fn(upper, lower, one_value, x, input_value, output_value):
       new_lower = torch.add(one_value, lower)
       output_value = linear_0(input_value)
-      weight = linear_0.weight # not be used actually, initialized as placeholder xlacomputation requirement 
-      bias = linear_0.bias # not be used actually, initialized as placeholder xlacomputation requirement 
-      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() # , output_value_real
+      weight = linear_0.weight  # not be used actually, initialized as placeholder xlacomputation requirement 
+      bias = linear_0.bias  # not be used actually, initialized as placeholder xlacomputation requirement 
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(
+          one_value, x), input_value.clone(), bias.clone(), weight.clone(
+          ), output_value.clone() # , output_value_real
 
     upper = torch.tensor([1], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)
@@ -111,7 +113,9 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     l_in_0 = torch.rand(10, device=xm.xla_device())
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
-    upper__, lower__, one_value__, torch_add_res__, input_value__, bias__, weight__, output_value_real__, = while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, l_in_0, output_value))
+    upper__, lower__, one_value__, torch_add_res__, input_value__, bias__, weight__, output_value_real__, = while_loop(
+        cond_fn, body_fn,
+        (upper, lower, one_value, init_val, l_in_0, output_value))
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 
@@ -124,36 +128,48 @@ def test_while_loop_tpu_simple_linear_class(self):
     torch.set_grad_enabled(False)
 
     class SimpleWithLinear(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.linear = torch.nn.Linear(10, 20).to(xm.xla_device())
-
-        def forward(self, upper, lower, one_value, x, input_value, output_value):
-            def cond_fn(upper, lower, one_value, x, input_value, output_value):
-                return lower[0] < upper[0]
-
-            def body_fn(upper, lower, one_value, x, input_value, output_value):
-                new_lower = torch.add(one_value, lower)
-                output_value_real = self.linear(input_value)
-                weight = self.linear.weight # not be used actually, initialized as placeholder xlacomputation requirement 
-                bias = self.linear.bias # not be used actually, initialized as placeholder xlacomputation requirement 
-                return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), output_value_real, weight.clone(), bias.clone()
-            return while_loop(cond_fn, body_fn, (upper, lower, one_value, x, input_value, output_value))
+
+      def __init__(self):
+        super().__init__()
+        self.linear = torch.nn.Linear(10, 20).to(xm.xla_device())
+
+      def forward(self, upper, lower, one_value, x, input_value, output_value):
+
+        def cond_fn(upper, lower, one_value, x, input_value, output_value):
+          return lower[0] < upper[0]
+
+        def body_fn(upper, lower, one_value, x, input_value, output_value):
+          new_lower = torch.add(one_value, lower)
+          output_value_real = self.linear(input_value)
+          weight = self.linear.weight  # not be used actually, initialized as placeholder xlacomputation requirement 
+          bias = self.linear.bias  # not be used actually, initialized as placeholder xlacomputation requirement 
+          return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(
+              one_value, x), input_value.clone(
+              ), output_value_real, weight.clone(), bias.clone()
+
+        return while_loop(
+            cond_fn, body_fn,
+            (upper, lower, one_value, x, input_value, output_value))
 
     simple_with_linear = SimpleWithLinear()
     upper = torch.tensor([52], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)
     one_value = torch.tensor([1], dtype=torch.int32, device=device)
     init_val = torch.tensor([1], dtype=torch.int32, device=device)
-    l_in_0 = torch.rand(10, device=xm.xla_device()) # input_value
+    l_in_0 = torch.rand(10, device=xm.xla_device())  # input_value
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
     weight_0 = simple_with_linear.linear.weight
     bias_0 = simple_with_linear.linear.bias
 
-    aaa = {"simple_with_linear": (simple_with_linear, (upper, lower, one_value, init_val, l_in_0, output_value))}
+    aaa = {
+        "simple_with_linear":
+            (simple_with_linear, (upper, lower, one_value, init_val, l_in_0,
+                                  output_value))
+    }
 
-    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = simple_with_linear(upper, lower, one_value, init_val, l_in_0, output_value)
+    upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = simple_with_linear(
+        upper, lower, one_value, init_val, l_in_0, output_value)
 
     # same weight/bias liear model
     linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
@@ -177,7 +193,8 @@ def test_fori_loop_tpu_addition(self):
     def body_fun(a, b):
       return torch.add(a, b)
 
-    upper_, new_lower_, one_value_, add_res_x_, res_ = fori_loop(upper, lower, body_fun, one_value, init_val)
+    upper_, new_lower_, one_value_, add_res_x_, res_ = fori_loop(
+        upper, lower, body_fun, one_value, init_val)
     expected = _fake_fori_loop(lower, upper, body_fun, init_val, one_value)
     self.assertEqual(expected, res_)
 
@@ -191,15 +208,17 @@ def test_fori_loop_tpu_simple_linear(self):
     lower = torch.tensor([0], dtype=torch.int32, device=device)
     init_val = torch.tensor([1], dtype=torch.int32, device=device)
     l_in_0 = torch.randn(10, device=xm.xla_device())
-    
+
     linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
 
-    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_= fori_loop(upper, lower, linear_0, init_val, l_in_0)
-    
+    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_= fori_loop(
+        upper, lower, linear_0, init_val, l_in_0)
+
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
 
     self.assertTrue(torch.all(torch.eq(expected, l_out_)))
 
+
 if __name__ == '__main__':
   test = unittest.main()
   sys.exit(0 if test.result.wasSuccessful() else 1)
\ No newline at end of file
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index a36649fcc3e..f07e9062f37 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -22,24 +22,36 @@ def fori_loop(upper, lower, body_fun, init_val, input_value):
 
   if (hasattr(body_fun, 'weight') or hasattr(body_fun, 'bias')):
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
+
     def cond_fn(upper, lower, one_value, x, input_value, output_value):
       return lower[0] < upper[0]
+
     def body_fn(upper, lower, one_value, x, input_value, output_value):
       new_lower = torch.add(one_value, lower)
       output_value = body_fun(input_value)
       weight = body_fun.weight  # not be used actually, initialized as placeholder xlacomputation requirement
       bias = body_fun.bias  # not be used actually, initialized as placeholder xlacomputation requirement
-      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), input_value.clone(), bias.clone(), weight.clone(), output_value.clone() 
-    res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value, output_value))
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(
+          one_value, x), input_value.clone(), bias.clone(), weight.clone(
+          ), output_value.clone()
+
+    res = torch_while_loop(
+        cond_fn, body_fn,
+        (upper, lower, one_value, init_val, input_value, output_value))
   else:
     output_value = torch.tensor([1], dtype=torch.int32, device=device)
+
     def cond_fn(upper, lower, one_value, x, input_value):
       return lower[0] < upper[0]
+
     def body_fn(upper, lower, one_value, x, input_value):
       new_lower = torch.add(one_value, lower)
       output_val = body_fun(one_value, input_value)
-      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(one_value, x), output_val.clone()
-    res = torch_while_loop(cond_fn, body_fn, (upper, lower, one_value, init_val, input_value))
+      return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(
+          one_value, x), output_val.clone()
+
+    res = torch_while_loop(cond_fn, body_fn,
+                           (upper, lower, one_value, init_val, input_value))
 
   return res
 
@@ -60,8 +72,9 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   for carried_input in carried_inputs:
     device = carried_input.device
     fake_carried_inputs.append(
-        torch.randint(10, carried_input.size(),
-                      dtype=carried_input.dtype).to(device))
+        torch.randint(
+            10, carried_input.size(),
+            dtype=carried_input.dtype).to(device))
   for additional_input in additional_inputs:
     device = additional_input.device
     fake_carried_inputs.append(
@@ -74,11 +87,16 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   cond_ctx.set_name_string("condctx")
 
   # TODO(@manfei): treat hard-code cond xlacomputation change: currently switch output_value and weight position if additional_inputs(weight/bias) exists
-  additional_inputs_list_cond = list(fake_carried_inputs[2:]) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
+  additional_inputs_list_cond = list(
+      fake_carried_inputs[2:]
+  ) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
   if additional_inputs:
-    tmp_bias = additional_inputs_list_cond[-3] # not used, change order doesn't affect logic
-    del additional_inputs_list_cond[-3] # not used, change order doesn't affect logic
-    additional_inputs_list_cond.append(tmp_bias) # not used, change order doesn't affect logic
+    tmp_bias = additional_inputs_list_cond[
+        -3] # not used, change order doesn't affect logic
+    del additional_inputs_list_cond[
+        -3] # not used, change order doesn't affect logic
+    additional_inputs_list_cond.append(
+        tmp_bias) # not used, change order doesn't affect logic
 
   cond_ctx.buildforiloop([cond_result], additional_inputs_list_cond)
   cond_hlo = cond_ctx.hlo()
@@ -132,7 +150,6 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
 
   # gain final result with generated while xlacomputation
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',
-                                                 (total_inputs),
-                                                 computation)
+                                                 (total_inputs), computation)
 
   return result
\ No newline at end of file

From 33fa1fbb30f3f305dbd2f1fe60d6225fc0abcf79 Mon Sep 17 00:00:00 2001
From: manfeibaigithub <manfei@google.com>
Date: Wed, 17 Apr 2024 23:12:49 +0000
Subject: [PATCH 322/323] format

---
 ...while_loop_simple_add_dispatch_in_torch.py | 18 ++++++++---------
 torch_xla/experimental/fori_loop.py           | 20 +++++++++----------
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
index 3b2b018cada..8a1f2bdb737 100644
--- a/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
+++ b/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
@@ -100,11 +100,11 @@ def cond_fn(upper, lower, one_value, x, input_value, output_value):
     def body_fn(upper, lower, one_value, x, input_value, output_value):
       new_lower = torch.add(one_value, lower)
       output_value = linear_0(input_value)
-      weight = linear_0.weight  # not be used actually, initialized as placeholder xlacomputation requirement 
-      bias = linear_0.bias  # not be used actually, initialized as placeholder xlacomputation requirement 
+      weight = linear_0.weight  # not be used actually, initialized as placeholder xlacomputation requirement
+      bias = linear_0.bias  # not be used actually, initialized as placeholder xlacomputation requirement
       return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(
           one_value, x), input_value.clone(), bias.clone(), weight.clone(
-          ), output_value.clone() # , output_value_real
+          ), output_value.clone()
 
     upper = torch.tensor([1], dtype=torch.int32, device=device)
     lower = torch.tensor([0], dtype=torch.int32, device=device)
@@ -141,8 +141,8 @@ def cond_fn(upper, lower, one_value, x, input_value, output_value):
         def body_fn(upper, lower, one_value, x, input_value, output_value):
           new_lower = torch.add(one_value, lower)
           output_value_real = self.linear(input_value)
-          weight = self.linear.weight  # not be used actually, initialized as placeholder xlacomputation requirement 
-          bias = self.linear.bias  # not be used actually, initialized as placeholder xlacomputation requirement 
+          weight = self.linear.weight  # not be used actually, initialized as placeholder xlacomputation requirement
+          bias = self.linear.bias  # not be used actually, initialized as placeholder xlacomputation requirement
           return upper.clone(), new_lower.clone(), one_value.clone(), torch.add(
               one_value, x), input_value.clone(
               ), output_value_real, weight.clone(), bias.clone()
@@ -156,7 +156,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     lower = torch.tensor([0], dtype=torch.int32, device=device)
     one_value = torch.tensor([1], dtype=torch.int32, device=device)
     init_val = torch.tensor([1], dtype=torch.int32, device=device)
-    l_in_0 = torch.rand(10, device=xm.xla_device())  # input_value
+    l_in_0 = torch.rand(10, device=xm.xla_device())
     output_value = torch.zeros([20], dtype=torch.float32, device=device)
 
     weight_0 = simple_with_linear.linear.weight
@@ -171,7 +171,7 @@ def body_fn(upper, lower, one_value, x, input_value, output_value):
     upper__, lower__, one_value__, torch_add_res__, input_value__, output_value_real__, weight__, bias__ = simple_with_linear(
         upper, lower, one_value, init_val, l_in_0, output_value)
 
-    # same weight/bias liear model
+    # create same weight/bias liear model for compare
     linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
     linear_0.weight.data = weight__
     linear_0.bias.data = bias__
@@ -211,7 +211,7 @@ def test_fori_loop_tpu_simple_linear(self):
 
     linear_0 = torch.nn.Linear(10, 20).to(xm.xla_device())
 
-    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_= fori_loop(
+    upper_, lower_, one_value_, add_res_x_, l_in_i_plus_1_, weight_, bias_, l_out_ = fori_loop(
         upper, lower, linear_0, init_val, l_in_0)
 
     expected = _fake_fori_loop(lower, upper, linear_0, l_in_0)
@@ -221,4 +221,4 @@ def test_fori_loop_tpu_simple_linear(self):
 
 if __name__ == '__main__':
   test = unittest.main()
-  sys.exit(0 if test.result.wasSuccessful() else 1)
\ No newline at end of file
+  sys.exit(0 if test.result.wasSuccessful() else 1)
diff --git a/torch_xla/experimental/fori_loop.py b/torch_xla/experimental/fori_loop.py
index f07e9062f37..8ed3a783200 100644
--- a/torch_xla/experimental/fori_loop.py
+++ b/torch_xla/experimental/fori_loop.py
@@ -72,14 +72,14 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   for carried_input in carried_inputs:
     device = carried_input.device
     fake_carried_inputs.append(
-        torch.randint(
-            10, carried_input.size(),
-            dtype=carried_input.dtype).to(device))
+        torch.randint(10, carried_input.size(),
+                      dtype=carried_input.dtype).to(device))
   for additional_input in additional_inputs:
     device = additional_input.device
     fake_carried_inputs.append(
-        torch.randint(10, additional_input.size(),
-                      dtype=additional_input.dtype).to(device))
+        torch.randint(
+            10, additional_input.size(),
+            dtype=additional_input.dtype).to(device))
 
   # TODO(@manfei): specify which element is for which argument like a,b,c
   cond_result = cond_fn(*fake_carried_inputs)
@@ -89,14 +89,14 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   # TODO(@manfei): treat hard-code cond xlacomputation change: currently switch output_value and weight position if additional_inputs(weight/bias) exists
   additional_inputs_list_cond = list(
       fake_carried_inputs[2:]
-  ) # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
+  )  # all missed arguments except upper/lower due to PyTorch/XLA trace from output tensor
   if additional_inputs:
     tmp_bias = additional_inputs_list_cond[
-        -3] # not used, change order doesn't affect logic
+        -3]  # not used, change order doesn't affect logic
     del additional_inputs_list_cond[
-        -3] # not used, change order doesn't affect logic
+        -3]  # not used, change order doesn't affect logic
     additional_inputs_list_cond.append(
-        tmp_bias) # not used, change order doesn't affect logic
+        tmp_bias)  # not used, change order doesn't affect logic
 
   cond_ctx.buildforiloop([cond_result], additional_inputs_list_cond)
   cond_hlo = cond_ctx.hlo()
@@ -152,4 +152,4 @@ def _xla_while_loop(cond_fn, body_fn, carried_inputs, additional_inputs=None):
   result = torch_xla._XLAC._xla_user_computation('xla::_op_test_while',
                                                  (total_inputs), computation)
 
-  return result
\ No newline at end of file
+  return result

From 332bd402429301d3b0d32a895f226f362aafde3c Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Fri, 31 May 2024 14:48:00 -0700
Subject: [PATCH 323/323] [rebase] rebase fori_loop_simple_case_test (#7165)

---
 .circleci/README.md                           |   19 -
 .circleci/doc_push.sh                         |   63 -
 .circleci/docker/cloudbuild.yaml              |   29 -
 .circleci/docker/install_llvm_clang.sh        |   33 -
 .circleci/setup_ci_environment.sh             |    2 +-
 .devcontainer/tpu-internal/devcontainer.json  |    6 +-
 .github/CODEOWNERS                            |    2 +-
 .github/ci.md                                 |  139 ++
 .github/scripts/run_tests.sh                  |  108 +
 .../docker => .github/upstream}/Dockerfile    |    1 +
 .../upstream}/install_conda.sh                |    0
 .../upstream}/install_valgrind.sh             |    0
 .github/workflows/_build.yml                  |  111 -
 .github/workflows/_build_plugin.yml           |    4 +-
 .github/workflows/_build_torch_with_cuda.yml  |   55 +
 .github/workflows/_build_torch_xla.yml        |   22 +-
 .github/workflows/_docs.yml                   |   84 +-
 .github/workflows/_get_torch_commit.yml       |   32 +
 .github/workflows/_test.yml                   |  151 +-
 .../workflows/_test_requiring_torch_cuda.yml  |  110 +
 .github/workflows/build_and_test.yml          |   61 +-
 .github/workflows/build_upstream_image.yml    |   40 +
 .github/workflows/lintercheck.yml             |    2 +-
 .github/workflows/torch_xla2.yml              |    6 +-
 .kokoro/Dockerfile                            |    2 +-
 .kokoro/presubmit.cfg                         |   10 +-
 BUILD                                         |   35 +
 CONTRIBUTING.md                               |   57 +-
 OP_LOWERING_GUIDE.md                          |    2 +-
 README.md                                     |   84 +-
 TROUBLESHOOTING.md                            |   97 +-
 WORKSPACE                                     |   57 +-
 benchmarks/benchmark_experiment.py            |   19 +-
 benchmarks/benchmark_model.py                 |   31 +-
 benchmarks/experiment_runner.py               |   29 +-
 benchmarks/requirements.txt                   |    3 +
 benchmarks/result_analyzer.py                 |    5 +
 benchmarks/run_benchmark.sh                   |    2 +-
 benchmarks/torchbench_model.py                |   47 +-
 benchmarks/util.py                            |   13 +-
 build_util.py                                 |    4 -
 codegen/BUILD                                 |    4 +
 codegen/xla_native_functions.yaml             |    1 +
 configuration.yaml                            |   21 -
 contrib/vscode/settings.json                  |   18 +-
 docs/README.md                                |   14 +-
 docs/assets/ci_test_dependency.png            |  Bin 0 -> 44721 bytes
 docs/assets/ci_test_dependency_gpu.png        |  Bin 0 -> 50628 bytes
 docs/fori_loop.md                             |  114 +
 docs/fsdpv2.md                                |   20 +-
 docs/gpu.md                                   |    7 +-
 docs/pallas.md                                |   57 +
 docs/requirements.txt                         |    4 +-
 docs/spmd.md                                  |   48 +-
 examples/README.md                            |   17 +
 examples/data_parallel/README.md              |    2 +
 examples/data_parallel/train_resnet_ddp.py    |   30 +
 .../train_resnet_spmd_data_parallel.py        |   49 +
 .../data_parallel/train_resnet_xla_ddp.py     |   25 +
 examples/debug/train_resnet_benchmark.py      |   49 +
 examples/debug/train_resnet_profile.py        |   30 +
 examples/decoder_only_model.py                |  227 ++
 .../train_decoder_only_flash_attention.py     |   33 +
 ...in_decoder_only_flash_attention_fsdp_v2.py |   36 +
 examples/fsdp/README.md                       |    2 +
 examples/fsdp/train_decoder_only_fsdp_v2.py   |   63 +
 examples/fsdp/train_resnet_fsdp_auto_wrap.py  |   56 +
 examples/train_decoder_only_base.py           |   73 +
 examples/train_resnet_amp.py                  |   35 +
 examples/train_resnet_base.py                 |   71 +
 experimental/torch_xla2/README.md             |   78 +-
 experimental/torch_xla2/dev-requirements.txt  |   12 +-
 experimental/torch_xla2/docs/dispatch.png     |  Bin 0 -> 150015 bytes
 .../torch_xla2/docs/fixing_op_info_test.md    |  211 ++
 experimental/torch_xla2/docs/how_it_works.md  |  134 ++
 experimental/torch_xla2/docs/ops_registry.md  |   40 +
 .../torch_xla2/examples/basic_training.py     |   31 +-
 .../torch_xla2/examples/basic_training_jax.py |   12 +-
 .../torch_xla2/examples/eager_mode.py         |   13 +-
 .../torch_xla2/examples/lightning_training.py |   77 +
 experimental/torch_xla2/pyproject.toml        |   28 +-
 experimental/torch_xla2/test-requirements.txt |    5 +
 .../torch_xla2/test/gemma/test_gemma.py       |    2 +-
 .../torch_xla2/test/llama/test_llama.py       |    5 +-
 experimental/torch_xla2/test/moe/__init__.py  |    0
 experimental/torch_xla2/test/moe/model.py     |  260 +++
 experimental/torch_xla2/test/moe/moe_test.py  |   75 +
 experimental/torch_xla2/test/test_context.py  |    8 +-
 .../torch_xla2/test/test_core_aten_ops.py     |  128 +-
 experimental/torch_xla2/test/test_exports.py  |  100 +-
 experimental/torch_xla2/test/test_extra.py    |   64 -
 .../torch_xla2/test/test_functions.py         |    6 +-
 .../torch_xla2/test/test_mutations.py         |   61 +-
 experimental/torch_xla2/test/test_ops.py      |   28 +-
 .../torch_xla2/test/test_symbolic_shapes.py   |   92 +
 .../test/test_unbounded_dynamism.py           |  662 ++++++
 experimental/torch_xla2/test_requirements.txt |    5 -
 .../torch_xla2/torch_xla2/__init__.py         |   29 +-
 experimental/torch_xla2/torch_xla2/_ops.py    | 1745 --------------
 .../torch_xla2/torch_xla2/decompositions.py   |   19 +-
 .../torch_xla2/torch_xla2/environment.py      |   24 -
 experimental/torch_xla2/torch_xla2/export.py  |  273 ++-
 experimental/torch_xla2/torch_xla2/extra.py   |   62 -
 .../torch_xla2/torch_xla2/functions.py        |  109 -
 experimental/torch_xla2/torch_xla2/interop.py |   69 +
 .../torch_xla2/torch_xla2/ops/__init__.py     |    9 +
 .../torch_xla2/torch_xla2/ops/jaten.py        | 1998 ++++++++++++++++-
 .../torch_xla2/torch_xla2/ops/jtorch.py       |   88 +
 .../torch_xla2/torch_xla2/ops/op_base.py      |   57 +-
 .../torch_xla2/torch_xla2/ops/ops_registry.py |   47 +
 .../torch_xla2/torch_xla2/ops_registry.py     |   74 -
 experimental/torch_xla2/torch_xla2/tensor.py  |  328 ++-
 experimental/torch_xla2/torch_xla2/types.py   |   12 +
 infra/ansible/config/env.yaml                 |   10 +-
 infra/ansible/config/vars.yaml                |    8 +
 .../ansible/roles/build_srcs/tasks/main.yaml  |   40 +
 .../artifacts.auto.tfvars                     |   50 +-
 infra/tpu-pytorch-releases/dev_images.tf      |    2 -
 .../cuda/torch_xla_cuda_plugin/__init__.py    |    3 +
 requirements.in                               |    9 +
 requirements_lock_3_10.txt                    |  153 ++
 requirements_lock_3_11.txt                    |  153 ++
 requirements_lock_3_8.txt                     |  153 ++
 requirements_lock_3_9.txt                     |  153 ++
 scripts/apply_patches.sh                      |    2 +-
 setup.py                                      |    8 +-
 test/benchmarks/run_tests.sh                  |   14 +-
 test/benchmarks/test_benchmark_experiment.py  |    5 +-
 test/benchmarks/test_experiment_runner.py     |   48 +-
 test/cpp/run_tests.sh                         |    2 +-
 test/cpp/test_aten_xla_tensor_4.cpp           |   13 +
 test/cpp/test_aten_xla_tensor_5.cpp           |   21 +
 test/debug_tool/extract_debug_helper.py       |   30 +
 test/debug_tool/test_mp_pt_xla_debug.py       |    4 +-
 test/debug_tool/test_pt_xla_debug.py          |  153 +-
 test/dynamo/test_dynamo.py                    |   15 +-
 test/pjrt/test_runtime_tpu.py                 |   16 +-
 test/pytorch_test_base.py                     |    1 +
 test/run_tests.sh                             |   14 +-
 test/spmd/test_dynamo_spmd.py                 |    2 +
 test/spmd/test_fsdp_v2.py                     |   76 +-
 test/spmd/test_xla_distributed_checkpoint.py  |   95 +-
 ...test_mark_pattern.py => test_composite.py} |    0
 test/stablehlo/test_export_fx_passes.py       |    4 +-
 test/stablehlo/test_exports.py                |    2 +-
 test/stablehlo/test_mlir_debuginfo.py         |   30 +-
 test/stablehlo/test_pt2e_qdq.py               |    6 +-
 test/stablehlo/test_stablehlo_custom_call.py  |  121 +
 test/stablehlo/test_unbounded_dynamism.py     |   54 +-
 test/test_devices.py                          |   57 +-
 test/test_gmm.py                              |  460 ++++
 test/test_input_output_aliases.py             |   44 +
 test/test_metrics.py                          |   54 +
 test/test_operations.py                       |  344 +++
 test/test_ops.py                              |    4 +-
 test/test_pallas.py                           |  415 +++-
 test/test_pallas_spmd.py                      |  110 +
 test/tpu/Dockerfile                           |    4 +-
 test/tpu/run_tests.sh                         |    9 +-
 torch_patches/README.md                       |   32 -
 torch_xla/__init__.py                         |   27 +-
 torch_xla/core/dynamo_bridge.py               |   10 +-
 torch_xla/core/xla_model.py                   |   16 +-
 torch_xla/csrc/BUILD                          |    2 +
 torch_xla/csrc/aten_cpu_fallback.cpp          |   12 +
 torch_xla/csrc/aten_cpu_fallback.h            |    4 +-
 torch_xla/csrc/aten_xla_type.cpp              |   92 +-
 torch_xla/csrc/debug_util.cpp                 |   90 +-
 torch_xla/csrc/debug_util.h                   |    4 +
 torch_xla/csrc/dl_convertor.cpp               |  345 +++
 torch_xla/csrc/dl_convertor.h                 |   14 +
 torch_xla/csrc/dtype.cpp                      |   14 +
 torch_xla/csrc/init_python_bindings.cpp       |  122 +-
 torch_xla/csrc/ops/custom_call.cpp            |   70 +
 torch_xla/csrc/ops/custom_call.h              |   29 +
 torch_xla/csrc/ops/embedding_bag.cpp          |  192 ++
 torch_xla/csrc/ops/embedding_bag.h            |   31 +
 torch_xla/csrc/ops/index_ops.cpp              |   47 +
 torch_xla/csrc/ops/xla_ops.cpp                |    1 +
 torch_xla/csrc/ops/xla_ops.h                  |    1 +
 torch_xla/csrc/reduction.cpp                  |    4 +-
 torch_xla/csrc/runtime/BUILD                  |    2 +-
 torch_xla/csrc/runtime/cache.h                |    1 +
 torch_xla/csrc/runtime/computation_client.h   |   25 +-
 .../csrc/runtime/ifrt_computation_client.cc   |   53 +-
 .../csrc/runtime/ifrt_computation_client.h    |   24 +-
 .../csrc/runtime/pjrt_computation_client.cc   |   75 +-
 .../csrc/runtime/pjrt_computation_client.h    |   49 +-
 torch_xla/csrc/runtime/pjrt_registry.cc       |   21 +-
 .../runtime/stablehlo_composite_helper.cc     |    2 +-
 torch_xla/csrc/runtime/xla_coordinator.h      |    2 +-
 torch_xla/csrc/tensor_methods.cpp             |   62 +-
 torch_xla/csrc/tensor_methods.h               |   11 +
 torch_xla/csrc/tensor_util.cpp                |   21 +
 torch_xla/csrc/tensor_util.h                  |    3 +
 torch_xla/csrc/unwrap_data.h                  |    3 +
 torch_xla/csrc/xla_graph_executor.cpp         |    2 +
 torch_xla/csrc/xla_manual_registration.cpp    |   14 +
 torch_xla/debug/metrics.py                    |    5 +
 torch_xla/distributed/spmd/__init__.py        |    2 +
 torch_xla/experimental/custom_kernel.py       |  700 +++++-
 .../distributed_checkpoint/__init__.py        |    2 +
 .../distributed_checkpoint/manager.py         |    5 +-
 .../distributed_checkpoint/util.py            |   44 +
 torch_xla/experimental/plugins.py             |    4 +-
 .../spmd_fully_sharded_data_parallel.py       |   34 +-
 .../experimental/stablehlo_custom_call.py     |   31 +
 torch_xla/experimental/xla_mlir_debuginfo.py  |    3 +-
 torch_xla/stablehlo.py                        |   69 +-
 torch_xla/torch_xla.py                        |   19 +
 torch_xla/utils/dlpack.py                     |   36 +
 211 files changed, 11677 insertions(+), 3540 deletions(-)
 delete mode 100644 .circleci/README.md
 delete mode 100755 .circleci/doc_push.sh
 delete mode 100644 .circleci/docker/cloudbuild.yaml
 delete mode 100644 .circleci/docker/install_llvm_clang.sh
 create mode 100644 .github/ci.md
 create mode 100755 .github/scripts/run_tests.sh
 rename {.circleci/docker => .github/upstream}/Dockerfile (98%)
 rename {.circleci/docker => .github/upstream}/install_conda.sh (100%)
 rename {.circleci/docker => .github/upstream}/install_valgrind.sh (100%)
 delete mode 100644 .github/workflows/_build.yml
 create mode 100644 .github/workflows/_build_torch_with_cuda.yml
 create mode 100644 .github/workflows/_get_torch_commit.yml
 create mode 100644 .github/workflows/_test_requiring_torch_cuda.yml
 create mode 100644 .github/workflows/build_upstream_image.yml
 create mode 100644 benchmarks/requirements.txt
 create mode 100644 docs/assets/ci_test_dependency.png
 create mode 100644 docs/assets/ci_test_dependency_gpu.png
 create mode 100644 docs/fori_loop.md
 create mode 100644 docs/pallas.md
 create mode 100644 examples/README.md
 create mode 100644 examples/data_parallel/README.md
 create mode 100644 examples/data_parallel/train_resnet_ddp.py
 create mode 100644 examples/data_parallel/train_resnet_spmd_data_parallel.py
 create mode 100644 examples/data_parallel/train_resnet_xla_ddp.py
 create mode 100644 examples/debug/train_resnet_benchmark.py
 create mode 100644 examples/debug/train_resnet_profile.py
 create mode 100644 examples/decoder_only_model.py
 create mode 100644 examples/flash_attention/train_decoder_only_flash_attention.py
 create mode 100644 examples/flash_attention/train_decoder_only_flash_attention_fsdp_v2.py
 create mode 100644 examples/fsdp/README.md
 create mode 100644 examples/fsdp/train_decoder_only_fsdp_v2.py
 create mode 100644 examples/fsdp/train_resnet_fsdp_auto_wrap.py
 create mode 100644 examples/train_decoder_only_base.py
 create mode 100644 examples/train_resnet_amp.py
 create mode 100644 examples/train_resnet_base.py
 create mode 100644 experimental/torch_xla2/docs/dispatch.png
 create mode 100644 experimental/torch_xla2/docs/fixing_op_info_test.md
 create mode 100644 experimental/torch_xla2/docs/how_it_works.md
 create mode 100644 experimental/torch_xla2/docs/ops_registry.md
 create mode 100644 experimental/torch_xla2/examples/lightning_training.py
 create mode 100644 experimental/torch_xla2/test-requirements.txt
 create mode 100644 experimental/torch_xla2/test/moe/__init__.py
 create mode 100644 experimental/torch_xla2/test/moe/model.py
 create mode 100644 experimental/torch_xla2/test/moe/moe_test.py
 delete mode 100644 experimental/torch_xla2/test/test_extra.py
 create mode 100644 experimental/torch_xla2/test/test_symbolic_shapes.py
 create mode 100644 experimental/torch_xla2/test/test_unbounded_dynamism.py
 delete mode 100644 experimental/torch_xla2/test_requirements.txt
 delete mode 100644 experimental/torch_xla2/torch_xla2/_ops.py
 delete mode 100644 experimental/torch_xla2/torch_xla2/extra.py
 delete mode 100644 experimental/torch_xla2/torch_xla2/functions.py
 create mode 100644 experimental/torch_xla2/torch_xla2/interop.py
 create mode 100644 experimental/torch_xla2/torch_xla2/ops/ops_registry.py
 delete mode 100644 experimental/torch_xla2/torch_xla2/ops_registry.py
 create mode 100644 experimental/torch_xla2/torch_xla2/types.py
 create mode 100644 requirements.in
 create mode 100644 requirements_lock_3_10.txt
 create mode 100644 requirements_lock_3_11.txt
 create mode 100644 requirements_lock_3_8.txt
 create mode 100644 requirements_lock_3_9.txt
 rename test/stablehlo/{test_mark_pattern.py => test_composite.py} (100%)
 create mode 100644 test/stablehlo/test_stablehlo_custom_call.py
 create mode 100644 test/test_gmm.py
 create mode 100644 test/test_pallas_spmd.py
 delete mode 100644 torch_patches/README.md
 create mode 100644 torch_xla/csrc/dl_convertor.cpp
 create mode 100644 torch_xla/csrc/dl_convertor.h
 create mode 100644 torch_xla/csrc/ops/custom_call.cpp
 create mode 100644 torch_xla/csrc/ops/custom_call.h
 create mode 100644 torch_xla/csrc/ops/embedding_bag.cpp
 create mode 100644 torch_xla/csrc/ops/embedding_bag.h
 create mode 100644 torch_xla/experimental/distributed_checkpoint/util.py
 create mode 100644 torch_xla/experimental/stablehlo_custom_call.py
 create mode 100644 torch_xla/utils/dlpack.py

diff --git a/.circleci/README.md b/.circleci/README.md
deleted file mode 100644
index d01e6138317..00000000000
--- a/.circleci/README.md
+++ /dev/null
@@ -1,19 +0,0 @@
-# CircleCI Overview
-PyTorch and PyTorch/XLA use CircleCI to lint, build, and test each PR that is submitted. All CircleCI tests should succeed before the PR is merged into master. PyTorch CircleCI pins PyTorch/XLA to a specific commit. On the other hand, PyTorch/XLA CircleCI pulls PyTorch from master unless a pin is manually provided. This README will go through the reasons of these pins, how to pin a PyTorch/XLA PR to an upstream PyTorch PR, and how to coordinate a merge for breaking PyTorch changes.
-
-## Why does PyTorch CircleCI pin PyTorch/XLA?
-As mentioned above, [PyTorch CircleCI pins PyTorch/XLA](https://github.com/pytorch/pytorch/blob/master/.jenkins/pytorch/common_utils.sh#L119) to a "known good" commit to prevent accidental changes from PyTorch/XLA to break PyTorch CircleCI without warning. PyTorch has hundreds of commits each week, and this pin ensures that PyTorch/XLA as a downstream package does not cause failures in PyTorch CircleCI. 
-
-## Why does PyTorch/XLA CircleCI pull from PyTorch master?
-[PyTorch/XLA CircleCI pulls PyTorch from master](https://github.com/pytorch/xla/blob/f3415929683880192b63b285921c72439af55bf0/.circleci/common.sh#L15) unless a PyTorch pin is manually provided. PyTorch/XLA is a downstream package to PyTorch, and pulling from master ensures that PyTorch/XLA will stay up-to-date and works with the latest PyTorch changes.
-
-## Pinning PyTorch PR in PyTorch/XLA PR
-Sometimes a PyTorch/XLA PR needs to be pinned to a specific PyTorch PR to test new featurues, fix breaking changes, etc. Since PyTorch/XLA CircleCI pulls from PyTorch master by default, we need to manually provided a PyTorch pin. In a PyTorch/XLA PR, PyTorch an be manually pinned by creating a `.torch_pin` under `/torch_patches`. The `.torch_pin` should have the corresponding PyTorch PR number prefixed by "#". Take a look at [example here](https://github.com/pytorch/xla/pull/3792/commits/40f41fb98b0f2386d287eeac0bae86e873d4a9d8). Before the PyTorch/XLA PR gets merged, the `.torch_pin` must be deleted. 
-
-## Coodinating merges for breaking PyTorch PRs
-When PyTorch PR introduces a breaking change, its PyTorch/XLA CircleCI tests will fail. Steps for fixing and merging such breaking PyTorch change is as following: 
-1. Create a PyTorch/XLA PR to fix this issue with `.torch_pin` and rebase with master to ensure the PR is up-to-date with the latest commit on PyTorch/XLA. Once this PR is created, it'll create a commit hash that will be used in step 2. If you have multiple commits in the PR, use the last one's hash. **Important note: When you rebase this PR, it'll create a new commit hash and make the old hash obsolete. Be cautious about rebasing, and if you rebase, make sure you inform the PyTorch PR's author.**
-2. Rebase (or ask the PR owner to rebase) the PyTorch PR with master. Update the PyTorch PR to pin the PyTorch/XLA to the commit hash created in step 1 by updating `pytorch/.github/ci_commit_pins/xla.txt`.
-3. Once CircleCI tests are green on both ends, merge PyTorch PR.
-4. Remove the `.torch_pin` in PyTorch/XLA PR and merge. To be noted, `git commit --amend` should be avoided in this step as PyTorch CI will keep using the commit hash created in step 1 until other PRs update that manually or the nightly buildbot updates that automatically.
-5. Finally, don't delete your branch until 2 days later. See step 4 for explanations.
diff --git a/.circleci/doc_push.sh b/.circleci/doc_push.sh
deleted file mode 100755
index 72b4a44f6e7..00000000000
--- a/.circleci/doc_push.sh
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-cd /tmp/pytorch/xla
-
-source ./xla_env
-source .circleci/common.sh
-
-echo "Building docs"
-pushd docs
-./docs_build.sh
-popd
-
-echo "Pushing to public"
-git config --global user.email "pytorchxla@gmail.com"
-git config --global user.name "torchxlabot2"
-GH_PAGES_BRANCH=gh-pages
-GH_PAGES_DIR=gh-pages-tmp
-CURRENT_COMMIT=`git rev-parse HEAD`
-BRANCH_NAME=`git rev-parse --abbrev-ref HEAD`
-if [[ "$BRANCH_NAME" == release/* ]]; then
-  SUBDIR_NAME=$BRANCH_NAME
-else
-  SUBDIR_NAME="master"
-fi
-pushd /tmp
-git clone --quiet -b "$GH_PAGES_BRANCH" https://github.com/pytorch/xla.git "$GH_PAGES_DIR"
-pushd $GH_PAGES_DIR
-rm -rf $SUBDIR_NAME
-mkdir -p $SUBDIR_NAME
-cp -fR /tmp/pytorch/xla/docs/build/* $SUBDIR_NAME
-git_status=$(git status --porcelain)
-if [[ $git_status ]]; then
-  echo "Doc is updated... Pushing to public"
-  echo "${git_status}"
-  sudo apt-get -qq update
-  export DEBIAN_FRONTEND=noninteractive
-  sudo ln -snf /usr/share/zoneinfo/Etc/UTC /etc/localtime
-  sudo sh -c "echo Etc/UTC > /etc/timezone"
-  sudo apt-get -qq -y install tzdata
-  sudo apt-get -qq install expect
-  git add .
-
-  COMMIT_MSG="Update doc from commit $CURRENT_COMMIT"
-  git commit -m "$COMMIT_MSG"
-  set +x
-/usr/bin/expect <<DONE
-spawn git push origin "$GH_PAGES_BRANCH"
-expect "Username*"
-send "torchxlabot2\n"
-expect "Password*"
-send "$::env(GITHUB_TORCH_XLA_BOT_TOKEN)\n"
-expect eof
-DONE
-  set -x
-else
-  echo "Nothing changed in documentation."
-fi
-popd
-popd
-
-
diff --git a/.circleci/docker/cloudbuild.yaml b/.circleci/docker/cloudbuild.yaml
deleted file mode 100644
index d3b154e0a65..00000000000
--- a/.circleci/docker/cloudbuild.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-# Cloud Build Configuration for .circleci base image build:
-# - Builds and pushes gcr.io/tpu-pytorch/xla_base:{_TAG_NAME}.
-# - This build is meant to be triggered manually in cloud builder
-steps:
-- name: 'gcr.io/cloud-builders/docker'
-  args: [
-          'build',
-          '--build-arg', 'cloud_build_prefix=.cicleci/docker',
-          '--build-arg', 'base_image=nvidia/cuda:${_CUDA_TAG}',
-          '--build-arg', 'cuda=${_CUDA}',
-          '--build-arg', 'python_version=${_PYTHON_VERSION}',
-          '-t', 'gcr.io/tpu-pytorch/xla_base:${_TAG_NAME}',
-          '-f', '.circleci/docker/Dockerfile', '.circleci/docker/'
-        ]
-  timeout: 2000s
-- name: 'gcr.io/cloud-builders/docker'
-  args: ['push', 'gcr.io/tpu-pytorch/xla_base']
-  timeout: 1000s
-
-options:
-    machineType: 'N1_HIGHCPU_32'
-    dynamic_substitutions: true
-    substitution_option: 'ALLOW_LOOSE'
-substitutions:
-    _CUDA: '1'
-    _CUDA_TAG: '11.7.0-cudnn8-devel-ubuntu18.04'
-    _PYTHON_VERSION: '3.7'
-    _TAG_NAME: '${_PYTHON_VERSION}-${_CUDA_TAG}-mini'
-timeout: 3000s
diff --git a/.circleci/docker/install_llvm_clang.sh b/.circleci/docker/install_llvm_clang.sh
deleted file mode 100644
index a911bce1aad..00000000000
--- a/.circleci/docker/install_llvm_clang.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-set -e
-set -x
-
-
-function debian_version {
-  local VER
-  if ! sudo apt-get install -y lsb-release > /dev/null 2>&1 ; then
-    VER="buster"
-  else
-    VER=$(lsb_release -c -s)
-  fi
-  echo "$VER"
-}
-
-function install_llvm_clang() {
-  local DEBVER=$(debian_version)
-  if ! apt-get install -y -s clang-8 > /dev/null 2>&1 ; then
-    maybe_append "deb http://apt.llvm.org/${DEBVER}/ llvm-toolchain-${DEBVER}-8 main" /etc/apt/sources.list
-    maybe_append "deb-src http://apt.llvm.org/${DEBVER}/ llvm-toolchain-${DEBVER}-8 main" /etc/apt/sources.list
-    wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add -
-    sudo apt-get update
-  fi
-  # Build config also sets CC=clang-8, CXX=clang++-8
-  sudo apt-get install -y clang-8 clang++-8
-  sudo apt-get install -y llvm-8 llvm-8-dev llvm-8-tools
-  sudo ln -s /usr/bin/clang-8 /usr/bin/clang
-  sudo ln -s /usr/bin/clang++-8 /usr/bin/clang++
-  export CC=clang-8 CXX=clang++-8
-}
-
-install_llvm_clang
diff --git a/.circleci/setup_ci_environment.sh b/.circleci/setup_ci_environment.sh
index eba2c373b8a..87a61524e7e 100755
--- a/.circleci/setup_ci_environment.sh
+++ b/.circleci/setup_ci_environment.sh
@@ -58,7 +58,7 @@ sudo apt-get -y remove linux-image-generic linux-headers-generic linux-generic d
 # How to figure out what the correct versions of these packages are?
 # My preferred method is to start a Docker instance of the correct
 # Ubuntu version (e.g., docker run -it ubuntu:16.04) and then ask
-# apt what the packages you need are.  Note that the CircleCI image
+# apt what the packages you need are.  Note that the CI image
 # comes with Docker.
 #
 # Using 'retry' here as belt-and-suspenders even though we are
diff --git a/.devcontainer/tpu-internal/devcontainer.json b/.devcontainer/tpu-internal/devcontainer.json
index 4358bd5612f..a0684c8f90f 100644
--- a/.devcontainer/tpu-internal/devcontainer.json
+++ b/.devcontainer/tpu-internal/devcontainer.json
@@ -17,14 +17,14 @@
         "llvm-vs-code-extensions.vscode-clangd",
         "ms-vscode.cpptools-themes",
         "BazelBuild.vscode-bazel",
-        "DevonDCarew.bazel-code",
         "StackBuild.bazel-stack-vscode",
         "StackBuild.bazel-stack-vscode-cc",
         "xaver.clang-format",
         "ryanluker.vscode-coverage-gutters",
         "ms-azuretools.vscode-docker",
-        "ms-python.python"
+        "ms-python.python",
+        "eeyore.yapf"
       ]
     }
   }
-}
\ No newline at end of file
+}
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 87072a65bce..bfff4ef8422 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1 +1 @@
-/infra @will-cromar @JackCaoG @yeounoh @mateuszlewko @stgpetrovic
+/infra @will-cromar @JackCaoG @lsy323
diff --git a/.github/ci.md b/.github/ci.md
new file mode 100644
index 00000000000..4f39ad4bd40
--- /dev/null
+++ b/.github/ci.md
@@ -0,0 +1,139 @@
+# CI Overview
+
+PyTorch and PyTorch/XLA use CI to lint, build, and test each PR that is submitted. All CI tests should succeed before the PR is merged into master. PyTorch CI pins PyTorch/XLA to a specific commit. On the other hand, PyTorch/XLA CI pulls PyTorch from master unless a pin is manually provided. This README will go through the reasons of these pins, how to pin a PyTorch/XLA PR to an upstream PyTorch PR, and how to coordinate a merge for breaking PyTorch changes.
+
+## Usage
+
+### Pinning PyTorch PR in PyTorch/XLA PR
+
+Sometimes a PyTorch/XLA PR needs to be pinned to a specific PyTorch PR to test new featurues, fix breaking changes, etc. Since PyTorch/XLA CI pulls from PyTorch master by default, we need to manually provided a PyTorch pin. In a PyTorch/XLA PR, PyTorch an be manually pinned by creating a `.torch_pin` file at the root of the repository. The `.torch_pin` should have the corresponding PyTorch PR number prefixed by "#". Take a look at [example here](https://github.com/pytorch/xla/pull/3792/commits/40f41fb98b0f2386d287eeac0bae86e873d4a9d8). Before the PyTorch/XLA PR gets merged, the `.torch_pin` must be deleted.
+
+### Coodinating merges for breaking PyTorch PRs
+
+When PyTorch PR introduces a breaking change, its PyTorch/XLA CI tests will fail. Steps for fixing and merging such breaking PyTorch change is as following:
+1. Create a PyTorch/XLA PR to fix this issue with `.torch_pin` and rebase with master to ensure the PR is up-to-date with the latest commit on PyTorch/XLA. Once this PR is created, it'll create a commit hash that will be used in step 2. If you have multiple commits in the PR, use the last one's hash. **Important note: When you rebase this PR, it'll create a new commit hash and make the old hash obsolete. Be cautious about rebasing, and if you rebase, make sure you inform the PyTorch PR's author.**
+2. Rebase (or ask the PR owner to rebase) the PyTorch PR with master. Update the PyTorch PR to pin the PyTorch/XLA to the commit hash created in step 1 by updating `pytorch/.github/ci_commit_pins/xla.txt`.
+3. Once CI tests are green on both ends, merge PyTorch PR.
+4. Remove the `.torch_pin` in PyTorch/XLA PR and merge. To be noted, `git commit --amend` should be avoided in this step as PyTorch CI will keep using the commit hash created in step 1 until other PRs update that manually or the nightly buildbot updates that automatically.
+5. Finally, don't delete your branch until 2 days later. See step 4 for explanations.
+
+### Running TPU tests on PRs
+
+By default, we only run TPU tests on a postsubmit basis to save capacity. If you are making a sensitive change, add the `tpuci` label to your PR. Note that the label must be present before `build_and_test.yml` triggers. If it has already run, make a new commit or rebase to trigger the CI again.
+
+## CI Environment
+
+Before the CI in this repository runs, we build a the base dev image. These are the same images we recommend in our VSCode `.devcontainer` setup and nightly build to ensure consistency between environments. We produce variants with and without CUDA, configured in `infra/ansible` (build config) and `infra/tpu-pytorch-releases/dev_images.tf` (build triggers).
+
+The CI runs in two environments:
+
+1. Organization self-hosted runners for CPU and GPU: used for amost every step of the CI. These runners are managed by PyTorch and have access to the shared ECR repository.
+2. TPU self-hosted runners: these are managed by us and are only availabe in the `pytorch/xla` repository. See the [_TPU CI_](#tpu-ci) section for more details.
+
+## Build and test (`build_and_test.yml`)
+
+We have two build paths for each CI run:
+
+- `torch_xla`: we build the main package to support for both TPU and GPU[^1], along with a CPU bild of `torch` from HEAD. This build step exports the `torch-xla-wheels` artifact for downstream use in tests.
+  - Some CI tests also require `torchvision`. To reduce flakiness, we compile `torchvision` from [`torch`'s CI pin](https://github.com/pytorch/pytorch/blob/main/.github/ci_commit_pins/vision.txt).
+  - C++ tests are piggybacked onto the same build and uploaded in the `cpp-test-bin` artifact.
+- `torch_xla_cuda_plugin`: the XLA CUDA runtime can be built independently of either `torch` or `torch_xla` -- it depends only on our pinned OpenXLA. Thus, this build should be almost entirely cached, unless your PR changes the XLA pin or adds a patch.
+
+Both the main package build and plugin build are configured with ansible at `infra/ansible`, although they run in separate stages (`stage=build_srcs` vs `stage=build_plugin`). This is the same configuration we use for our nightly and release builds.
+
+The CPU and GPU test configs are defined in the same file, `_test.yml`. Since some of the tests come from the upstream PyTorch repository, we check out PyTorch at the same git rev as the `build` step (taken from `torch_xla.version.__torch_gitrev__`). The tests are split up into multiple groups that run in parallel; the `matrix` section of `_test.yml` corresponds to in `.github/scripts/run_tests.sh`.
+
+CPU tests run immediately after then `torch_xla` build completes. This will likely be the first test feedback on your commit. GPU tests will launch when both the `torch_xla` and `torch_xla_cuda_plugin` complete. GPU compilation is much slower due to the number of possible optimizations, and the GPU chips themselves are quite outdated, so these tests will take longer to run than the CPU tests.
+
+![CPU tests launch when `torch_xla` is complete](../docs/assets/ci_test_dependency.png)
+
+![GPU tests also depend on CUDA plugin](../docs/assets/ci_test_dependency_gpu.png)
+
+For the C++ test groups in either case, the test binaries are pre-built during the build phase and packaged in `cpp-test-bin`. This will only be downloaded if necessary.
+
+[^1]: Note: both GPU and TPU support require their respective plugins to be installed. This package will _not_ work on either out of the box.
+
+### TPU CI
+
+The TPU CI runs only a subset of our tests due to capacity constraints, defined in `_tpu_ci.yml` `test/tpu/run_tests.sh`. The runners themselves are containers in GKE managed by [ARC](https://github.com/actions/actions-runner-controller). The container image is also based on our dev images, with some changes for ARC compatibility. The Dockerfile for this image lives in `test/tpu/Dockerfile`.
+
+The actual ARC cluster is defined in Terraform at `infra/tpu-pytorch/tpu_ci.yml`.
+
+### Reproducing test failures
+
+The best way to reproduce failures in the CI is to use the recommended container configuration in `.devcontainer`. These use identical images/environments as the CI.
+
+If you cannot reproduce the failure or need to inspect the package built in a CI run, you can download the `torch-xla-wheels` artifact for that run, [either locally in your web browser or remotely with the `gh` CLI tool](https://docs.github.com/en/actions/managing-workflow-runs/downloading-workflow-artifacts). C++ tests in particular can be quite slow to build. If you need to re-run these yourself, download the `cpp-test-bin` artifact. You'll have to set some additional environment variables for these to load the correct `torch` and plugin binaries, so you should copy the variables we set in `_test.yml` before runnign them.
+
+### Generating docs
+
+Our API documentation is generated automatically from the `torch_xla` package with `sphinx`. The workflow to update our static site is defined in `_docs.yml`. The workflow is roughly the following:
+
+- Changes to `master` update the docs at `/master` on the `gh-pages` branch.
+- Changes to a release brance update the docs under `/releases/rX.Y`.
+
+By default, we redirect to the latest stable version, defined in [`index.md`](https://github.com/pytorch/xla/blob/gh-pages/index.md).
+
+We build preview docs for every CI, but only push to `gh-pages` for `master` and release branches. To preview doc changes, download the `github-pages` artifact locally and open `index.html` in your browser.
+
+Changes to `gh-pages` are pushed by our bot account, `torchxlabot2`.
+
+### FAQ and Troubleshooting
+
+#### Why does PyTorch CI pin PyTorch/XLA?
+
+As mentioned above, [PyTorch CI pins PyTorch/XLA](https://github.com/pytorch/pytorch/blob/master/.jenkins/pytorch/common_utils.sh#L119) to a "known good" commit to prevent accidental changes from PyTorch/XLA to break PyTorch CI without warning. PyTorch has hundreds of commits each week, and this pin ensures that PyTorch/XLA as a downstream package does not cause failures in PyTorch CI.
+
+#### Why does PyTorch/XLA CI pull from PyTorch master?
+
+[PyTorch/XLA CI pulls PyTorch from master](https://github.com/pytorch/xla/blob/f3415929683880192b63b285921c72439af55bf0/.circleci/common.sh#L15) unless a PyTorch pin is manually provided. PyTorch/XLA is a downstream package to PyTorch, and pulling from master ensures that PyTorch/XLA will stay up-to-date and works with the latest PyTorch changes.
+
+#### TPU CI is broken
+
+If the TPU CI won't run, try to debug using the following steps:
+
+On your cloudtop:
+
+```
+gcloud config set project tpu-pytorch
+gcloud container clusters get-credentials tpu-ci --location=us-central2
+```
+
+Check to see if the runner pod is working:
+
+```
+kubectl get pods -n arc-runners
+```
+
+If it is working, check the logs:
+
+```
+kubectl logs -n arc-runners <runner-pod-name>
+```
+
+If there is no runner pod available, you can check the controller logs. First find the controller pod name:
+
+```
+kubectl get pods -n arc-systems
+```
+
+The name should match actions-runner-controller-gha-rs-controller-*. You can then check the logs by running the following:
+
+```
+kubectl logs -n arc-systems <controller-pod-name>
+```
+
+If the ephemeralrunner spawning the runner pods is stuck in an error, you can attempt the following to restart the ephemeralrunner and check the logs:
+
+```
+kubectl delete ephemeralrunners --all -A
+kubectl logs -f -n arc-runners $(kubectl get pods -n arc-runners -l 'actions.github.com/scale-set-name=v4-runner-set' -o jsonpath='{.items[0].metadata.name}')
+```
+
+## Upstream CI image (`build_upstream_image.yml`)
+
+We use different build tools than the upstream `torch` repository due to our dependency on XLA, namely `bazel`. To ensure the upstream CI has the correct tools to run XLA, we layer some additional tools and changes on top of our dev image and push the result to upstream's ECR instance. The upstream CI image is defined in `.github/upstream`.
+
+If you are making a breaking change to the image, bump the image version tag in `build_upstream_image.yml` first and then send a PR to `pytorch/pytorch` to update the tag on their side ([example](https://github.com/pytorch/pytorch/pull/125319)).
+
+Note: the upstream CI still relies on some legacy scripts in `.circleci` rather than our Ansible config. Don't update these without checking if they break the upstream CI first! TODO: finally delete these.
diff --git a/.github/scripts/run_tests.sh b/.github/scripts/run_tests.sh
new file mode 100755
index 00000000000..ae59a51490d
--- /dev/null
+++ b/.github/scripts/run_tests.sh
@@ -0,0 +1,108 @@
+set -ex
+
+function run_torch_xla_python_tests() {
+  PYTORCH_DIR=$1
+  XLA_DIR=$2
+  USE_COVERAGE="${3:-0}"
+
+  pushd $XLA_DIR
+    echo "Running Python Tests"
+    if [ "$USE_COVERAGE" != "0" ]; then
+      pip install coverage==6.5.0 --upgrade
+      pip install coverage-lcov
+      pip install toml
+      ./test/run_tests.sh
+      coverage combine
+      mkdir lcov && cp .coverage lcov/
+      coverage-lcov --data_file_path lcov/.coverage
+      coverage html
+      cp lcov.info htmlcov/
+      mv htmlcov ~/
+      chmod -R 755 ~/htmlcov
+    else
+      ./test/run_tests.sh
+    fi
+  popd
+}
+
+function run_torch_xla_cpp_tests() {
+  PYTORCH_DIR=$1
+  XLA_DIR=$2
+  USE_COVERAGE="${3:-0}"
+
+  TORCH_DIR=$(python -c "import pkgutil; import os; print(os.path.dirname(pkgutil.get_loader('torch').get_filename()))")
+  export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${TORCH_DIR}/lib
+  if [ -x "$(command -v nvidia-smi)" ]; then
+    CUDA_PLUGIN_DIR=$(python -c "import pkgutil; import os; print(os.path.dirname(pkgutil.get_loader('torch_xla_cuda_plugin').get_filename()))")
+    export PJRT_LIBRARY_PATH=$CUDA_PLUGIN_DIR/lib/pjrt_c_api_gpu_plugin.so
+    export PJRT_DEVICE=LIBRARY
+    export PJRT_DYNAMIC_PLUGINS=1
+  else
+    export PJRT_DEVICE=CPU
+  fi
+  export XLA_EXPERIMENTAL="nonzero:masked_select:nms"
+
+  test_names1=("test_aten_xla_tensor_1"
+               "test_aten_xla_tensor_2"
+               "test_aten_xla_tensor_3"
+               "test_aten_xla_tensor_4"
+               "pjrt_computation_client_test"
+               "ifrt_computation_client_test")
+  test_names2=("test_aten_xla_tensor_5"
+               "test_aten_xla_tensor_6"
+               "test_ir"
+               "test_lazy"
+               "test_replication"
+               "test_tensor"
+               # disable test_xla_backend_intf since it is flaky on upstream
+               #"test_xla_backend_intf"
+               "test_xla_sharding")
+  if [[ "$RUN_CPP_TESTS1" == "cpp_tests1" ]]; then
+    test_names=("${test_names1[@]}")
+  elif [[ "$RUN_CPP_TESTS2" == "cpp_tests2" ]]; then
+    test_names=("${test_names2[@]}")
+  else
+    test_names=("${test_names1[@]}" "${test_names2[@]}")
+  fi
+
+  for name in "${test_names[@]}"; do
+    echo "Running $name cpp test..."
+    /tmp/test/bin/${name}
+  done
+}
+
+function run_torch_xla_benchmark_tests() {
+  XLA_DIR=$1
+  pushd $XLA_DIR
+    echo "Running Benchmark Tests"
+    test/benchmarks/run_tests.sh -L""
+}
+
+PYTORCH_DIR=$1
+XLA_DIR=$2
+USE_COVERAGE="${3:-0}"
+RUN_CPP="${RUN_CPP_TESTS:0}"
+RUN_PYTHON="${RUN_PYTHON_TESTS:0}"
+
+if [ -x "$(command -v nvidia-smi)" ]; then
+  num_devices=$(nvidia-smi --list-gpus | wc -l)
+  echo "Found $num_devices GPU devices..."
+  export GPU_NUM_DEVICES=$num_devices
+fi
+export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla"
+export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))")
+
+if [[ -z "$RUN_BENCHMARK_TESTS" && -z "$RUN_CPP_TESTS1" && -z "$RUN_CPP_TESTS2" && -z "$RUN_PYTHON_TESTS" ]]; then
+  run_torch_xla_python_tests $PYTORCH_DIR $XLA_DIR $USE_COVERAGE
+  run_torch_xla_cpp_tests $PYTORCH_DIR $XLA_DIR $USE_COVERAGE
+  run_torch_xla_benchmark_tests $XLA_DIR
+else
+  # run tests separately.
+  if [[ "$RUN_PYTHON_TESTS" == "python_tests" ]]; then
+    run_torch_xla_python_tests $PYTORCH_DIR $XLA_DIR $USE_COVERAGE
+  elif [[ "$RUN_BENCHMARK_TESTS" == "benchmark_tests" ]]; then
+    run_torch_xla_benchmark_tests $XLA_DIR
+  else
+    run_torch_xla_cpp_tests $PYTORCH_DIR $XLA_DIR $USE_COVERAGE
+  fi
+fi
diff --git a/.circleci/docker/Dockerfile b/.github/upstream/Dockerfile
similarity index 98%
rename from .circleci/docker/Dockerfile
rename to .github/upstream/Dockerfile
index f0cd196511c..006460c2477 100644
--- a/.circleci/docker/Dockerfile
+++ b/.github/upstream/Dockerfile
@@ -1,3 +1,4 @@
+# Dockerfile for image used by upstream CI
 # This requires cuda & cudnn packages pre-installed in the base image.
 # Other available cuda images are listed at https://hub.docker.com/r/nvidia/cuda
 ARG base_image="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1"
diff --git a/.circleci/docker/install_conda.sh b/.github/upstream/install_conda.sh
similarity index 100%
rename from .circleci/docker/install_conda.sh
rename to .github/upstream/install_conda.sh
diff --git a/.circleci/docker/install_valgrind.sh b/.github/upstream/install_valgrind.sh
similarity index 100%
rename from .circleci/docker/install_valgrind.sh
rename to .github/upstream/install_valgrind.sh
diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml
deleted file mode 100644
index 789d0579272..00000000000
--- a/.github/workflows/_build.yml
+++ /dev/null
@@ -1,111 +0,0 @@
-name: xla-buld
-on:
-  workflow_call:
-    inputs:
-      gcr-docker-image:
-        required: true
-        type: string
-        description: Base image for builds
-      ecr-docker-image-base:
-        required: true
-        type: string
-        description: Container registry to upload image to
-      runner:
-        required: false
-        type: string
-        description: Runner type for the test
-        default: linux.12xlarge
-      cuda:
-        required: false
-        type: string
-        description: Whether to build XLA with CUDA
-        default: 1
-
-    secrets:
-      gcloud-service-key:
-        required: true
-        description: Secret to access Bazel build cache
-
-    outputs:
-      docker-image:
-        value: ${{ jobs.build.outputs.docker-image }}
-        description: The docker image containing the built PyTorch.
-jobs:
-  build:
-    runs-on: ${{ inputs.runner }}
-    timeout-minutes: 240
-    outputs:
-      docker-image: ${{ steps.upload-docker-image.outputs.docker-image }}
-    env:
-      ECR_DOCKER_IMAGE_BASE: ${{ inputs.ecr-docker-image-base }}
-      GCR_DOCKER_IMAGE: ${{ inputs.gcr-docker-image }}
-      WORKDIR: /var/lib/jenkins/workspace
-      SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
-      GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
-      XLA_CUDA: ${{ inputs.cuda }}
-      BAZEL_JOBS: 16
-    steps:
-      - name: Setup Linux
-        uses: pytorch/test-infra/.github/actions/setup-linux@main
-      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        with:
-          github-secret: ${{ secrets.GITHUB_TOKEN }}
-          instructions: |
-            Build is  done inside the container, to start an interactive session run:
-              docker exec -it $(docker container ps --format '{{.ID}}') bash
-      - name: Checkout repo
-        uses: actions/checkout@v3
-      - name: Download docker image from GCR
-        shell: bash
-        run: docker pull "${GCR_DOCKER_IMAGE}"
-      - name: Stage image to ECR
-        shell: bash
-        run: |
-            # This is to stage PyTorch/XLA base image for use in the upstream.
-            # To allow the upstream workflow to access PyTorch/XLA build images, we
-            # need to have them in the ECR. This is not expensive, and only pushes it
-            # if image layers are not present in the repo.
-            # Note: disable the following 2 lines while testing a new image, so we do not
-            # push to the upstream.
-            docker tag "${GCR_DOCKER_IMAGE}" "${ECR_DOCKER_IMAGE_BASE}:v1.1-lite" >/dev/null
-            docker push "${ECR_DOCKER_IMAGE_BASE}:v1.1-lite" >/dev/null
-      - name: Start the container
-        shell: bash
-        run: |
-          pid=$(docker run --privileged -t -d -w "$WORKDIR" "${GCR_DOCKER_IMAGE}")
-          docker exec -u jenkins "${pid}" sudo chown -R jenkins "${WORKDIR}"
-          docker cp "${GITHUB_WORKSPACE}/." "$pid:$WORKDIR"
-          echo "pid=${pid}" >> "${GITHUB_ENV}"
-
-      - name: Prepare build env
-        shell: bash
-        run: |
-          echo "declare -x SCCACHE_BUCKET=${SCCACHE_BUCKET}" | docker exec -i "${pid}" sh -c "cat >> env"
-          echo "declare -x XLA_CUDA=${XLA_CUDA}" | docker exec -i "${pid}" sh -c "cat >> xla_env"
-          echo "declare -x BAZEL_JOBS=${BAZEL_JOBS}" | docker exec -i "${pid}" sh -c "cat >> xla_env"
-          echo "declare -x BAZEL_REMOTE_CACHE=1" | docker exec -i "${pid}" sh -c "cat >> xla_env"
-          echo "${GCLOUD_SERVICE_KEY}" | docker exec -i "${pid}" sh -c "cat >> default_credentials.json"
-
-      - name: Build
-        shell: bash
-        run: |
-          docker exec --privileged -u jenkins "${pid}" bash -c ".circleci/build.sh"
-      - name: Cleanup build env
-        shell: bash
-        run: |
-           docker exec "${pid}" rm default_credentials.json /tmp/pytorch/xla/default_credentials.json
-
-      - name: Push built docker image to ECR
-        id: upload-docker-image
-        shell: bash
-        run: |
-          export COMMIT_DOCKER_IMAGE="${ECR_DOCKER_IMAGE_BASE}:latest-${GITHUB_SHA}"
-          time docker commit "${pid}" "${COMMIT_DOCKER_IMAGE}"
-          time docker push "${COMMIT_DOCKER_IMAGE}"
-          echo "docker-image=${COMMIT_DOCKER_IMAGE}" >> "${GITHUB_OUTPUT}"
-
-      - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
-        if: always()
-
diff --git a/.github/workflows/_build_plugin.yml b/.github/workflows/_build_plugin.yml
index 5f773971430..69b93fd5b81 100644
--- a/.github/workflows/_build_plugin.yml
+++ b/.github/workflows/_build_plugin.yml
@@ -25,7 +25,7 @@ jobs:
       GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
       GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json
       BAZEL_JOBS: 16
-      BAZEL_REMOTE_CACHE: 1
+      BAZEL_REMOTE_CACHE: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
     steps:
       - name: Setup gcloud
         shell: bash
@@ -39,7 +39,7 @@ jobs:
         shell: bash
         run: |
           cd pytorch/xla/infra/ansible
-          ansible-playbook playbook.yaml -vvv -e "stage=build_plugin arch=amd64 accelerator=cuda src_root=${GITHUB_WORKSPACE}" --skip-tags=fetch_srcs,install_deps
+          ansible-playbook playbook.yaml -vvv -e "stage=build_plugin arch=amd64 accelerator=cuda cuda_compute_capabilities=5.2,7.5 src_root=${GITHUB_WORKSPACE} cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps
       - name: Upload wheel
         uses: actions/upload-artifact@v4
         with:
diff --git a/.github/workflows/_build_torch_with_cuda.yml b/.github/workflows/_build_torch_with_cuda.yml
new file mode 100644
index 00000000000..e9defd40eb5
--- /dev/null
+++ b/.github/workflows/_build_torch_with_cuda.yml
@@ -0,0 +1,55 @@
+name: build-torch-with-cuda
+on:
+  workflow_call:
+    inputs:
+      dev-image:
+        required: true
+        type: string
+        description: Base image for builds
+      torch-commit:
+          required: true
+          type: string
+          description: torch-commit
+      runner:
+        required: false
+        type: string
+        description: Runner type for the test
+        default: linux.12xlarge
+jobs:
+  build:
+    runs-on: ${{ inputs.runner }}
+    container:
+      image: ${{ inputs.dev-image }}
+      options: "--gpus all --shm-size 16g"
+    env:
+      _GLIBCXX_USE_CXX11_ABI: 0
+    steps:
+      # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802
+      - name: Clean up workspace
+        run: |
+          ls -la
+          rm -rvf ${GITHUB_WORKSPACE}/*
+      - name: Setup CUDA environment
+        shell: bash
+        run: |
+          echo "PATH=$PATH:/usr/local/cuda-12.1/bin" >> $GITHUB_ENV
+          echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/lib64" >> $GITHUB_ENV
+      - name: Check GPU
+        run: nvidia-smi
+      - name: Checkout PyTorch Repo
+        uses: actions/checkout@v4
+        with:
+          repository: pytorch/pytorch
+          path: pytorch
+          ref: ${{ inputs.torch-commit }}
+          submodules: recursive
+      - name: Build
+        shell: bash
+        run: |
+          cd pytorch
+          USE_CUDA=1 python setup.py bdist_wheel
+      - name: Upload wheel
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch-with-cuda
+          path: pytorch/dist/*.whl
diff --git a/.github/workflows/_build_torch_xla.yml b/.github/workflows/_build_torch_xla.yml
index 969fb3b5dc9..56e6b5408c3 100644
--- a/.github/workflows/_build_torch_xla.yml
+++ b/.github/workflows/_build_torch_xla.yml
@@ -6,6 +6,10 @@ on:
         required: true
         type: string
         description: Base image for builds
+      torch-commit:
+          required: true
+          type: string
+          description: torch-commit
       runner:
         required: false
         type: string
@@ -25,8 +29,14 @@ jobs:
       GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
       GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json
       BAZEL_JOBS: 16
-      BAZEL_REMOTE_CACHE: 1
+      BAZEL_REMOTE_CACHE: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
+      BUILD_CPP_TESTS: 1
     steps:
+      # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802
+      - name: Clean up workspace
+        run: |
+          ls -la
+          rm -rvf ${GITHUB_WORKSPACE}/*
       - name: Setup gcloud
         shell: bash
         run: |
@@ -36,7 +46,8 @@ jobs:
         with:
           repository: pytorch/pytorch
           path: pytorch
-          # TODO: correct pin
+          ref: ${{ inputs.torch-commit }}
+          submodules: recursive
       - name: Checkout PyTorch/XLA Repo
         uses: actions/checkout@v4
         with:
@@ -45,9 +56,14 @@ jobs:
         shell: bash
         run: |
           cd pytorch/xla/infra/ansible
-          ansible-playbook playbook.yaml -e "stage=build arch=amd64 accelerator=tpu src_root=${GITHUB_WORKSPACE} bundle_libtpu=0" --skip-tags=fetch_srcs,install_deps
+          ansible-playbook playbook.yaml -vvv -e "stage=build arch=amd64 accelerator=tpu src_root=${GITHUB_WORKSPACE} bundle_libtpu=0 build_cpp_tests=1 git_versioned_xla_build=1 cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps
       - name: Upload wheel
         uses: actions/upload-artifact@v4
         with:
           name: torch-xla-wheels
           path: /dist/*.whl
+      - name: Upload CPP test binaries
+        uses: actions/upload-artifact@v4
+        with:
+          name: cpp-test-bin
+          path: /tmp/test/bin
diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml
index ed9a4ab0ea9..378dec9697a 100644
--- a/.github/workflows/_docs.yml
+++ b/.github/workflows/_docs.yml
@@ -2,10 +2,10 @@ name: xla-docs-build
 on:
   workflow_call:
     inputs:
-      docker-image:
+      dev-image:
         required: true
         type: string
-        description: Image to build docs in
+        description: Base image for builds
       runner:
         required: false
         type: string
@@ -15,35 +15,57 @@ on:
       torchxla-bot-token:
         required: true
 jobs:
-  push-docs:
-    runs-on: ${{ inputs.runner }}
+  build-docs:
+    runs-on: ubuntu-latest
     timeout-minutes: 45
+    container:
+      image: ${{ inputs.dev-image }}
     env:
-      DOCKER_IMAGE: ${{ inputs.docker-image }}
-      WORKDIR: /var/lib/jenkins/workspace
+      BRANCH_NAME: ${{ github.ref_name }}
     steps:
-      - name: Setup Linux
-        uses: pytorch/test-infra/.github/actions/setup-linux@main
-      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        with:
-          github-secret: ${{ secrets.GITHUB_TOKEN }}
-          instructions: |
-            Doc builds are done inside container. Interactive session can be started by following:
-              docker exec -it $(docker container ps --format '{{.ID}}') bash
-      - name: Download and run docker image from GCR
-        shell: bash
-        env:
-          GITHUB_TORCH_XLA_BOT_TOKEN: ${{ secrets. torchxla-bot-token }}
-        run: |
-          echo "DOCKER_IMAGE: ${DOCKER_IMAGE}"
-          docker pull "${DOCKER_IMAGE}"
-          pid=$(docker run -e GITHUB_TORCH_XLA_BOT_TOKEN -t -d -w "$WORKDIR" "${DOCKER_IMAGE}")
-          echo "${GCLOUD_SERVICE_KEY}" | docker exec -i "${pid}" sh -c "cat >> /tmp/pytorch/xla/default_credentials.json"
-          echo "pid=${pid}" >> "${GITHUB_ENV}"
-      - name: Build & publish docs
-        shell: bash
-        run: docker exec -u jenkins "${pid}" bash -c '.circleci/doc_push.sh'
-      - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
-        if: always()
+    - name: Fetch wheels
+      uses: actions/download-artifact@v4
+      with:
+        name: torch-xla-wheels
+        path: /tmp/wheels/
+    - name: Install wheels
+      shell: bash
+      run: |
+        pip install /tmp/wheels/*.whl
+    - name: Checkout PyTorch/XLA Repo
+      uses: actions/checkout@v4
+      with:
+        path: pytorch/xla
+    - name: Build docs
+      shell: bash
+      run: |
+        cd pytorch/xla/docs
+        pip install -r requirements.txt
+        sphinx-build -b html source build
+    - name: Checkout GitHub Pages
+      uses: actions/checkout@v4
+      with:
+        path: gh-pages
+        ref: gh-pages
+        token: ${{ secrets.torchxla-bot-token }}
+    - name: Merge changes
+      shell: bash
+      run: |
+        subdir=${{ env.BRANCH_NAME == 'master' && 'master' || format('{0}/{1}', 'release', env.BRANCH_NAME) }}
+        mkdir -p gh-pages/$subdir
+        cp -fR pytorch/xla/docs/build/* gh-pages/$subdir
+    - name: Upload preview as artifact
+      uses: actions/upload-artifact@v4
+      with:
+        name: github-pages
+        path: pytorch/xla/docs/build/
+    - name: Deploy
+      shell: bash
+      run: |
+        cd gh-pages
+        git config user.email "pytorchxla@gmail.com"
+        git config user.name "torchxlabot2"
+        git add . -v
+        git diff --cached --exit-code || git commit -m "Update doc from commit ${{ github.sha }}"
+        git push origin gh-pages
+      if: github.event_name == 'push'
diff --git a/.github/workflows/_get_torch_commit.yml b/.github/workflows/_get_torch_commit.yml
new file mode 100644
index 00000000000..debaecd8194
--- /dev/null
+++ b/.github/workflows/_get_torch_commit.yml
@@ -0,0 +1,32 @@
+name: get-torch-commit
+on:
+  workflow_call:
+    outputs:
+      torch_commit:
+        description: "torch commit to be used"
+        value: ${{ jobs.get-commit.outputs.torch_commit }}
+
+jobs:
+  get-commit:
+    runs-on: ubuntu-20.04
+    outputs: 
+      torch_commit: ${{ steps.get_torch_commit.outputs.torch_commit }}
+    steps:
+      # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802
+      - name: Clean up workspace
+        run: |
+          ls -la
+          rm -rvf ${GITHUB_WORKSPACE}/*
+      - name: Checkout PyTorch Repo
+        uses: actions/checkout@v4
+        with:
+          repository: pytorch/pytorch
+          path: pytorch
+          submodules: recursive
+      - id: get_torch_commit
+        name: Get torch commit
+        run: |
+          cd pytorch
+          torch_commit=$(git rev-parse HEAD) 
+          echo "torch_commit=$torch_commit" >> "$GITHUB_OUTPUT"
+      
diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
index 0f9e96e31e5..8a454cc075b 100644
--- a/.github/workflows/_test.yml
+++ b/.github/workflows/_test.yml
@@ -2,10 +2,10 @@ name: xla-test
 on:
   workflow_call:
     inputs:
-      docker-image:
+      dev-image:
         required: true
         type: string
-        description: Image to test on
+        description: Base image for builds
       runner:
         required: false
         type: string
@@ -22,16 +22,12 @@ on:
         default: 270
         description: |
           Set the maximum (in minutes) how long the workflow should take to finish
-      disable-pjrt:
+            timeout-minutes:
+      install-cuda-plugin:
         required: false
-        type: string
-        default: 0
-        description: Whether to disable PJRT tests
-      test-script:
-        required: false
-        type: string
-        default: test.sh
-        description: Which test script to run
+        type: boolean
+        default: false
+        description: Whether to install CUDA plugin package
 
     secrets:
       gcloud-service-key:
@@ -40,14 +36,15 @@ on:
 jobs:
   test:
     runs-on: ${{ inputs.runner }}
+    container:
+      image: ${{ inputs.dev-image }}
+      options: "${{ inputs.install-cuda-plugin && '--gpus all' || '' }} --shm-size 16g"
     strategy:
       fail-fast: false
       matrix:
         include:
           # Use readable strings as they define the workflow titles.
           - run_benchmark_tests: 'benchmark_tests'
-          - run_cpp_tests1: 'cpp_tests1'
-          - run_cpp_tests2: 'cpp_tests2'
           - run_python_tests: 'python_tests'
             run_xla_op_tests1: 'xla_op1'
           - run_python_tests: 'python_tests'
@@ -56,63 +53,112 @@ jobs:
             run_xla_op_tests3: 'xla_op3'
           - run_python_tests: 'python_tests'
             run_torch_mp_op_tests: 'torch_mp_op'
+          - run_cpp_tests: 'cpp_tests'
+            run_cpp_tests1: 'cpp_tests1'
+          - run_cpp_tests: 'cpp_tests'
+            run_cpp_tests2: 'cpp_tests2'
     timeout-minutes: ${{ inputs.timeout-minutes }}
     env:
-      DOCKER_IMAGE: ${{ inputs.docker-image }}
-      WORKDIR: /var/lib/jenkins/workspace
       GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
+      GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json
       USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }}
-      XLA_SKIP_TORCH_OP_TESTS: ${{ inputs.disable-pjrt }}
-      XLA_SKIP_MP_OP_TESTS: ${{ inputs.disable-pjrt }}
       RUN_BENCHMARK_TESTS: ${{ matrix.run_benchmark_tests }}
-      RUN_CPP_TESTS1: ${{ matrix.run_cpp_tests1 }}
-      RUN_CPP_TESTS2: ${{ matrix.run_cpp_tests2 }}
       RUN_PYTHON_TESTS: ${{ matrix.run_python_tests }}
       RUN_XLA_OP_TESTS1: ${{ matrix.run_xla_op_tests1 }}
       RUN_XLA_OP_TESTS2: ${{ matrix.run_xla_op_tests2 }}
       RUN_XLA_OP_TESTS3: ${{ matrix.run_xla_op_tests3 }}
       RUN_TORCH_MP_OP_TESTS: ${{ matrix.run_torch_mp_op_tests }}
+      RUN_CPP_TESTS1: ${{ matrix.run_cpp_tests1 }}
+      RUN_CPP_TESTS2: ${{ matrix.run_cpp_tests2 }}
+      BAZEL_JOBS: 16
+      BAZEL_REMOTE_CACHE: 1
     steps:
-      - name: Setup Linux
-        uses: pytorch/test-infra/.github/actions/setup-linux@main
-      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+      # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802
+      - name: Clean up workspace
+        run: |
+          ls -la
+          rm -rvf ${GITHUB_WORKSPACE}/*
+      - name: Setup gcloud
+        shell: bash
+        run: |
+          echo "${GCLOUD_SERVICE_KEY}" > $GOOGLE_APPLICATION_CREDENTIALS
+      - name: Fetch wheels
+        uses: actions/download-artifact@v4
         with:
-          github-secret: ${{ secrets.GITHUB_TOKEN }}
-          instructions: |
-            Tests are done inside the container, to start an interactive session run:
-              docker exec -it $(docker container ps --format '{{.ID}}') bash
-      - name: Install gcloud CLI
-        if: ${{ inputs.collect-coverage }}
+          name: torch-xla-wheels
+          path: /tmp/wheels/
+      - name: Fetch CPP test binaries
+        uses: actions/download-artifact@v4
+        with:
+          name: cpp-test-bin
+          path: /tmp/test/bin
+        if: ${{ matrix.run_cpp_tests }}
+      # GitHub Actions doesn't preserve executable permissions
+      # https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss
+      - name: Set CPP test permissions
+        run: |
+          chmod +x /tmp/test/bin/*
+          ls -l /tmp/test/bin
+        if: ${{ matrix.run_cpp_tests }}
+      - name: Fetch CUDA plugin
+        uses: actions/download-artifact@v4
+        with:
+          name: cuda-plugin
+          path: /tmp/wheels/
+        if: ${{ inputs.install-cuda-plugin }}
+      - name: Setup CUDA environment
         shell: bash
         run: |
-          sudo tee -a /etc/yum.repos.d/google-cloud-sdk.repo << EOM
-          [google-cloud-cli]
-          name=Google Cloud CLI
-          baseurl=https://packages.cloud.google.com/yum/repos/cloud-sdk-el8-x86_64
-          enabled=1
-          gpgcheck=1
-          repo_gpgcheck=0
-          gpgkey=https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
-          EOM
-          sudo yum install -y google-cloud-cli
-      - name: Auth to GCR
-        if: ${{ inputs.collect-coverage }}
+          # TODO: Make PJRT_DEVICE=CPU work with XLA_REGISTER_INSTALLED_PLUGINS=1
+          echo "XLA_REGISTER_INSTALLED_PLUGINS=1" >> $GITHUB_ENV
+
+          echo "PATH=$PATH:/usr/local/cuda-12.1/bin" >> $GITHUB_ENV
+          echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/lib64" >> $GITHUB_ENV
+        if: ${{ inputs.install-cuda-plugin }}
+      - name: Check GPU
+        run: nvidia-smi
+        if: ${{ inputs.install-cuda-plugin }}
+      - name: Install wheels
         shell: bash
         run: |
-          echo "${GCLOUD_SERVICE_KEY}" | gcloud auth activate-service-account --key-file=-
-      - name: Download and run docker image from GCR
+          pip install /tmp/wheels/*.whl
+          # TODO: Add these in setup.py
+          pip install fsspec
+          pip install rich
+
+          echo "Import check..."
+          python -c "import torch_xla"
+      - name: Record PyTorch commit
+        run: |
+          # Don't just pipe output in shell because imports may do extra logging
+          python -c "
+          import torch_xla.version
+          with open('$GITHUB_ENV', 'a') as f:
+            f.write(f'PYTORCH_COMMIT={torch_xla.version.__torch_gitrev__}\n')
+          "
+      - name: Checkout PyTorch Repo
+        uses: actions/checkout@v4
+        with:
+          repository: pytorch/pytorch
+          path: pytorch
+          ref: ${{ env.PYTORCH_COMMIT }}
+      - name: Checkout PyTorch/XLA Repo
+        uses: actions/checkout@v4
+        with:
+          path: pytorch/xla
+      - name: Extra CI deps
         shell: bash
         run: |
-          echo "DOCKER_IMAGE: ${DOCKER_IMAGE}"
-          docker pull "${DOCKER_IMAGE}"
-          pid=$(docker run --shm-size=16g ${GPU_FLAG:-} -e USE_COVERAGE -e XLA_SKIP_TORCH_OP_TESTS -e XLA_SKIP_MP_OP_TESTS -e RUN_BENCHMARK_TESTS -e RUN_CPP_TESTS1 -e RUN_CPP_TESTS2 -e RUN_PYTHON_TESTS -e RUN_XLA_OP_TESTS1 -e RUN_XLA_OP_TESTS2 -e RUN_XLA_OP_TESTS3 -e RUN_TORCH_MP_OP_TESTS -t -d -w "$WORKDIR" "${DOCKER_IMAGE}")
-          echo "${GCLOUD_SERVICE_KEY}" | docker exec -i "${pid}" sh -c "cat >> /tmp/pytorch/xla/default_credentials.json"
-          echo "pid=${pid}" >> "${GITHUB_ENV}"
+          set -x
+
+          pip install expecttest unittest-xml-reporting
+
+          if [[ ! -z "$RUN_BENCHMARK_TESTS" ]]; then
+            pip install -r pytorch/xla/benchmarks/requirements.txt
+          fi
       - name: Test
         shell: bash
-        run: |
-          docker exec --privileged -u jenkins "${pid}" bash -c '.circleci/${{ inputs.test-script }}'
+        run: pytorch/xla/.github/scripts/run_tests.sh pytorch/ pytorch/xla/ $USE_COVERAGE
       - name: Upload coverage results
         if: ${{ inputs.collect-coverage }}
         shell: bash
@@ -158,8 +204,3 @@ jobs:
                 gsutil cp inc_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json
               fi
             fi
-
-      - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
-        if: always()
-
diff --git a/.github/workflows/_test_requiring_torch_cuda.yml b/.github/workflows/_test_requiring_torch_cuda.yml
new file mode 100644
index 00000000000..a3e265e557f
--- /dev/null
+++ b/.github/workflows/_test_requiring_torch_cuda.yml
@@ -0,0 +1,110 @@
+name: xla-test-requiring-torch-cuda
+on:
+  workflow_call:
+    inputs:
+      dev-image:
+        required: true
+        type: string
+        description: Base image for builds
+      runner:
+        required: false
+        type: string
+        description: Runner type for the test
+        default: linux.12xlarge
+      collect-coverage:
+        required: false
+        type: boolean
+        description: Set to true to collect coverage information
+        default: false
+      timeout-minutes:
+        required: false
+        type: number
+        default: 30
+        description: |
+          Set the maximum (in minutes) how long the workflow should take to finish
+            timeout-minutes:
+
+jobs:
+  test:
+    runs-on: ${{ inputs.runner }}
+    container:
+      image: ${{ inputs.dev-image }}
+      options: "--gpus all --shm-size 16g"
+    timeout-minutes: ${{ inputs.timeout-minutes }}
+    env:
+      USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }}
+      BAZEL_JOBS: 16
+      BAZEL_REMOTE_CACHE: 1
+    steps:
+      # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802
+      # TODO: need to find a way to reuse these steps.
+      - name: Clean up workspace
+        run: |
+          ls -la
+          rm -rvf ${GITHUB_WORKSPACE}/*
+      - name: Fetch torch/torch_xla/torchvision wheels
+        uses: actions/download-artifact@v4
+        with:
+          name: torch-xla-wheels
+          path: /tmp/wheels/
+      - name: Remove torch wheel built with CUDA disabled
+        shell: bash
+        run: |
+          rm -rf /tmp/wheels/torch-*
+      - name: Fetch the torch wheel built with CUDA enabled
+        uses: actions/download-artifact@v4
+        with:
+          name: torch-with-cuda
+          path: /tmp/wheels/
+      - name: Fetch CUDA plugin
+        uses: actions/download-artifact@v4
+        with:
+          name: cuda-plugin
+          path: /tmp/wheels/
+      - name: Setup CUDA environment
+        shell: bash
+        run: |
+          echo "XLA_REGISTER_INSTALLED_PLUGINS=1" >> $GITHUB_ENV
+
+          echo "PATH=$PATH:/usr/local/cuda-12.1/bin" >> $GITHUB_ENV
+          echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/lib64" >> $GITHUB_ENV
+      - name: Check GPU
+        run: nvidia-smi
+      - name: Install wheels
+        shell: bash
+        run: |
+          pip install /tmp/wheels/*.whl
+          # TODO: Add these in setup.py
+          pip install fsspec
+          pip install rich
+
+          echo "Import check..."
+          python -c "import torch, torch_xla, torchvision"
+          echo "Import check done."
+          echo "Check if CUDA is available for PyTorch..."
+          python -c "import torch; assert torch.cuda.is_available()"
+          echo "CUDA is available for PyTorch."
+      - name: Record PyTorch commit
+        run: |
+          # Don't just pipe output in shell because imports may do extra logging
+          python -c "
+          import torch_xla.version
+          with open('$GITHUB_ENV', 'a') as f:
+            f.write(f'PYTORCH_COMMIT={torch_xla.version.__torch_gitrev__}\n')
+          "
+      - name: Checkout PyTorch Repo
+        uses: actions/checkout@v4
+        with:
+          repository: pytorch/pytorch
+          path: pytorch
+          ref: ${{ env.PYTORCH_COMMIT }}
+      - name: Checkout PyTorch/XLA Repo
+        uses: actions/checkout@v4
+        with:
+          path: pytorch/xla
+      - name: Test
+        shell: bash
+        run: |
+          set -xue
+          PJRT_DEVICE=CUDA python pytorch/xla/test/test_operations.py -v
+          PJRT_DEVICE=CUDA python pytorch/xla/test/dynamo/test_dynamo.py -v
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 38203f57580..1a924f65036 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -19,38 +19,45 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  build:
-    name: "Build PyTorch/XLA (GPU)"
-    uses: ./.github/workflows/_build.yml
-    with:
-      ecr-docker-image-base: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base
-      gcr-docker-image: gcr.io/tpu-pytorch/xla_base:dev-3.8_cuda_12.1
-      cuda: 1
-    secrets:
-      gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
+
+  get-torch-commit:
+    name: "Get torch commit"
+    uses: ./.github/workflows/_get_torch_commit.yml
 
   build-torch-xla:
-    name: "Build PyTorch/XLA (TPU)"
+    name: "Build PyTorch/XLA"
     uses: ./.github/workflows/_build_torch_xla.yml
+    needs: get-torch-commit
     with:
       dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_tpuvm
+      torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
+  build-torch-with-cuda:
+    name: "Build PyTorch with CUDA"
+    uses: ./.github/workflows/_build_torch_with_cuda.yml
+    needs: get-torch-commit
+    with:
+      # note that to build a torch wheel with CUDA enabled, we do not need a GPU runner.
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
+      torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
+      runner: linux.8xlarge.nvidia.gpu
+
   build-cuda-plugin:
     name: "Build XLA CUDA plugin"
     uses: ./.github/workflows/_build_plugin.yml
     with:
-      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
-  test-cpu:
+  test-python-cpu:
     name: "CPU tests"
     uses: ./.github/workflows/_test.yml
-    needs: build
+    needs: build-torch-xla
     with:
-      docker-image: ${{ needs.build.outputs.docker-image }}
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_tpuvm
       timeout-minutes: 120
       collect-coverage: false
     secrets:
@@ -59,28 +66,38 @@ jobs:
   test-cuda:
     name: "GPU tests"
     uses: ./.github/workflows/_test.yml
-    needs: build
+    needs: [build-torch-xla, build-cuda-plugin]
     with:
-      docker-image: ${{ needs.build.outputs.docker-image }}
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
       runner: linux.8xlarge.nvidia.gpu
       timeout-minutes: 300
-      collect-coverage: false  # TODO(yeounoh) separate from CPU coverage metrics
+      collect-coverage: false
+      install-cuda-plugin: true
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
+  test-cuda-with-pytorch-cuda-enabled:
+    name: "GPU tests requiring torch CUDA"
+    uses: ./.github/workflows/_test_requiring_torch_cuda.yml
+    needs: [build-torch-with-cuda, build-torch-xla, build-cuda-plugin]
+    with:
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
+      runner: linux.8xlarge.nvidia.gpu
+      timeout-minutes: 300
+      collect-coverage: false
+
   test-tpu:
     name: "TPU tests"
     uses: ./.github/workflows/_tpu_ci.yml
     needs: build-torch-xla
     # Only run this for HEAD and releases
-    if: github.event_name == 'push'
+    if: github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'tpuci')
 
   push-docs:
-    name: "Build & publish docs"
-    if: github.event_name == 'push' && (github.event.ref == 'refs/heads/master' || startsWith(github.event.ref, 'refs/tags/r'))
+    name: "Build docs"
     uses: ./.github/workflows/_docs.yml
-    needs: build
+    needs: build-torch-xla
     with:
-      docker-image: ${{ needs.build.outputs.docker-image }}
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_tpuvm
     secrets:
       torchxla-bot-token: ${{ secrets.TORCH_XLA_BOT_TOKEN }}
diff --git a/.github/workflows/build_upstream_image.yml b/.github/workflows/build_upstream_image.yml
new file mode 100644
index 00000000000..446ad366e54
--- /dev/null
+++ b/.github/workflows/build_upstream_image.yml
@@ -0,0 +1,40 @@
+name: Build upstream image
+on:
+  push:
+    branches:
+      - master
+      - r[0-9]+.[0-9]+
+    paths-ignore:
+      - 'experimental/torch_xla2/**'
+  workflow_dispatch:
+jobs:
+  build:
+    runs-on: linux.12xlarge
+    timeout-minutes: 30
+    env:
+      ECR_DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base
+      BAZEL_JOBS: 16
+    steps:
+      # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802
+      - name: Clean up workspace
+        run: |
+          ls -la
+          sudo rm -rvf ${GITHUB_WORKSPACE}/*
+      - name: Setup Linux
+        uses: pytorch/test-infra/.github/actions/setup-linux@main
+      - name: Checkout repo
+        uses: actions/checkout@v3
+      - name: Build Docker image
+        shell: bash
+        run: |
+          docker build -t "${ECR_DOCKER_IMAGE_BASE}:v1.2-lite" .github/upstream
+      - name: Stage image to ECR
+        shell: bash
+        run: |
+            # This is to stage PyTorch/XLA base image for use in the upstream.
+            # To allow the upstream workflow to access PyTorch/XLA build images, we
+            # need to have them in the ECR. This is not expensive, and only pushes it
+            # if image layers are not present in the repo.
+            # Note: disable the following line while testing a new image, so we do not
+            # push to the upstream.
+            docker push "${ECR_DOCKER_IMAGE_BASE}:v1.2-lite"
diff --git a/.github/workflows/lintercheck.yml b/.github/workflows/lintercheck.yml
index 6598b98da32..b17c608f883 100644
--- a/.github/workflows/lintercheck.yml
+++ b/.github/workflows/lintercheck.yml
@@ -24,7 +24,7 @@ jobs:
         if: github.event_name == 'push' && github.event.ref == 'refs/heads/master'
         shell: bash
         run: |
-          TORCH_PIN=./torch_patches/.torch_pin
+          TORCH_PIN=./.torch_pin
           if [[ -f "${TORCH_PIN}" ]]; then
             echo "Please remove ${TORCH_PIN} before landing."
             exit 1
diff --git a/.github/workflows/torch_xla2.yml b/.github/workflows/torch_xla2.yml
index 7c5a88bf430..441addad422 100644
--- a/.github/workflows/torch_xla2.yml
+++ b/.github/workflows/torch_xla2.yml
@@ -34,10 +34,8 @@ jobs:
         shell: bash
         working-directory: experimental/torch_xla2
         run: |
-          pip install pytest absl-py jax[cpu] flatbuffers tensorflow
-          pip install torch --index-url https://download.pytorch.org/whl/cpu 
-          pip install -r test_requirements.txt
-          pip install -e .
+          pip install -r test-requirements.txt
+          pip install -e .[cpu]
       - name: Run tests
         working-directory: experimental/torch_xla2
         shell: bash
diff --git a/.kokoro/Dockerfile b/.kokoro/Dockerfile
index e85930b57d9..f04cd7d7874 100644
--- a/.kokoro/Dockerfile
+++ b/.kokoro/Dockerfile
@@ -41,7 +41,7 @@ RUN python setup.py install
 # install torch xla
 ADD ./ /pytorch/xla
 WORKDIR /pytorch/xla
-RUN cp keystore_content/77422_pytorch_tpu_cloud_build default_credentials.json
+# TODO: to reenable kokoro back, please request security key and save in default_credentials.json
 ARG SCCACHE="$(which sccache)"
 
 WORKDIR /pytorch/xla
diff --git a/.kokoro/presubmit.cfg b/.kokoro/presubmit.cfg
index 3f166e3557e..3b35cde7315 100644
--- a/.kokoro/presubmit.cfg
+++ b/.kokoro/presubmit.cfg
@@ -7,11 +7,5 @@ build_file: "xla/.kokoro/build_and_run_stablehlo_tests.sh"
 timeout_mins: 360
 
 before_action {
-  fetch_keystore {
-    keystore_resource {
-      keystore_config_id: 77422
-      keyname: "pytorch_tpu_cloud_build"
-      backend: "blade:keystore-fastconfigpush"
-    }
-  }
-}
\ No newline at end of file
+# TODO: to reenable kokoro, please setup how to fetch keysore here
+}
diff --git a/BUILD b/BUILD
index 6949f6dc748..5e1c90de84d 100644
--- a/BUILD
+++ b/BUILD
@@ -3,6 +3,21 @@ load(
     "if_cuda_is_configured",
 )
 
+load("@python//:defs.bzl", "compile_pip_requirements")
+load("@python_version_repo//:py_version.bzl", "REQUIREMENTS")
+
+compile_pip_requirements(
+    name = "requirements",
+    extra_args = [
+        "--allow-unsafe",
+        "--build-isolation",
+        "--rebuild",
+    ],
+    requirements_in = "requirements.in",
+    requirements_txt = REQUIREMENTS,
+    generate_hashes = True,
+)
+
 cc_binary(
     name = "_XLAC.so",
     copts = [
@@ -30,3 +45,23 @@ cc_binary(
         "@xla//xla/stream_executor:cuda_platform",
     ]),
 )
+
+test_suite(
+    name = "cpp_tests",
+    # testonly = True,
+    tests = [
+        "//test/cpp:test_aten_xla_tensor_1",
+        "//test/cpp:test_aten_xla_tensor_2",
+        "//test/cpp:test_aten_xla_tensor_3",
+        "//test/cpp:test_aten_xla_tensor_4",
+        "//test/cpp:test_aten_xla_tensor_5",
+        "//test/cpp:test_aten_xla_tensor_6",
+        "//test/cpp:test_ir",
+        "//test/cpp:test_lazy",
+        "//test/cpp:test_replication",
+        "//test/cpp:test_tensor",
+        "//test/cpp:test_xla_sharding",
+        "//torch_xla/csrc/runtime:pjrt_computation_client_test",
+        "//torch_xla/csrc/runtime:ifrt_computation_client_test",
+    ],
+)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index e4d0b161139..97c418a7eca 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -6,9 +6,58 @@ You are very welcome to pick issues from [good first issue](https://github.com/p
 If you plan to contribute new features, utility functions or extensions to the core, please first open an issue and discuss the feature with us.
 Sending a PR without discussion might end up resulting in a rejected PR, because we might be taking the core in a different direction than you might be aware of.
 
-## Building Manually
+## Building from source
 
-We recommend you to use our prebuilt Docker image to start your development work. If you want to use VSCode with docker, please refer to this [config](https://github.com/pytorch/xla/tree/master/.devcontainer/tpu-contributor).
+We recommend you to use our prebuilt Docker image to start your development work using one of the two following methods.
+
+### Visual Studio Code Dev Container
+
+* Create an empty directory (optionally on a remote host via SSH) and open it in VSCode. Then, clone PyTorch and PyTorch/XLA:
+
+  ```bash
+  git clone --recursive --depth=1 https://github.com/pytorch/pytorch.git
+  git clone https://github.com/pytorch/xla.git pytorch/xla
+  # Optional: use git@github.com:pytorch/xla.git instead if you prefer to use SSH with key forwarding
+  ```
+
+* Link (or copy) VSCode configuration to your workspace directory:
+
+  ```bash
+  ln -s pytorch/xla/.devcontainer/ .devcontainer
+  ln -s pytorch/xla/contrib/vscode/ .vscode
+  ln -s pytorch/xla/.style.yapf .style.yapf
+  ln -s pytorch/xla/.clang-format .clang-format
+  ```
+
+* From VSCode's command menu, run `Reopen in Container` to open your workspace in one of our pre-built Docker containers. Select the correct container config based on your local accelerator (default to `tpu-contributor` if you are not sure).
+
+* Since you are running as root in this container, change ownership of all files:
+
+  ```bash
+  chown -R root:root .
+  ```
+
+* Build PyTorch and PyTorch/XLA:
+
+  ```bash
+  cd pytorch/
+  # pytorch/xla requires pytorch wheel to be presented under pytorch/dist
+  python setup.py bdist_wheel
+  python setup.py install
+  cd xla/
+  python setup.py develop
+  # Optional: if you're using TPU, install libtpu
+  pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
+  ```
+
+* Test your build
+
+  ```bash
+  python -c 'import torch_xla as xla; print(xla.device())'
+  # Output: xla:0
+  ```
+
+### Manually build in Docker container
 
 * Setup Development Docker Image
 
@@ -34,6 +83,8 @@ We recommend you to use our prebuilt Docker image to start your development work
 
 * Build PyTorch
   ```Shell
+  # pytorch/xla requires pytorch wheel to be presented under pytorch/dist
+  python setup.py bdist_wheel
   python setup.py develop
   ```
 * Build PyTorch/XLA
@@ -42,7 +93,7 @@ We recommend you to use our prebuilt Docker image to start your development work
   python setup.py develop
   ```
 
-### Build PyTorch/XLA from source with GPU support
+### Additional steps for GPU
 
 Please refer to this [guide](https://github.com/pytorch/xla/blob/master/docs/gpu.md#develop-pytorchxla-on-a-gpu-instance-build-pytorchxla-from-source-with-gpu-support).
 
diff --git a/OP_LOWERING_GUIDE.md b/OP_LOWERING_GUIDE.md
index b445a1d8998..535d7cf596c 100644
--- a/OP_LOWERING_GUIDE.md
+++ b/OP_LOWERING_GUIDE.md
@@ -25,7 +25,7 @@ All file mentioned below lives under the `xla/torch_xla/csrc` folder, with the e
 7. `ops/` directory contains all `ir::ops` declaration and definition. Smaller nodes can be put in `ops/ops.h/.cpp`. More complicated nodes can be put into a separate file. All ops inherit from `ir::ops::Node` and provide a way to lower input `ir::Value` to a sequence of `XlaOp`.
 
 ## Unit Test
-Our CircleCI runs PyTorch native python tests for every change and every day. Those tests will use XLA implementation if we provide a lowering. We usually don’t need to add additional python tests for PyTorch/XLA unless we want to verify some xla behaviors(like dynamic shape) or we skipped the pytorch native test for some reason. The python test should be added to `xla/test/test_operations.py` if it is required. We also need to add CPP tests in `xla/test/cpp/test_aten_xla_tensor.cpp`. This test should call PyTorch c++ API and verify our implementation yields the same result as PyTorch native implementation. We also need to verify if the xla implementation is called when the tensor is a XLA tensor by checking the `aten::op` and `xla::op` counters.
+Our CI runs PyTorch native python tests for every change and every day. Those tests will use XLA implementation if we provide a lowering. We usually don’t need to add additional python tests for PyTorch/XLA unless we want to verify some xla behaviors(like dynamic shape) or we skipped the pytorch native test for some reason. The python test should be added to `xla/test/test_operations.py` if it is required. We also need to add CPP tests in `xla/test/cpp/test_aten_xla_tensor.cpp`. This test should call PyTorch c++ API and verify our implementation yields the same result as PyTorch native implementation. We also need to verify if the xla implementation is called when the tensor is a XLA tensor by checking the `aten::op` and `xla::op` counters.
 
 ## Tips
 The process of lowering is breaking down the PyTorch operations into a sequence of XlaOp. To provide a good lowering of the PyTorch operation, one needs to have a good grasp of what XLA is capable of. Reading the XlaOp document and looking into how similar ops is lowered is the best way to achieve that. You can find a minimal Op lowering example in [this pr](https://github.com/pytorch/xla/pull/2969). You can also find a slightly more complicated example with backward lowering in [this pr](https://github.com/pytorch/xla/pull/2972).
diff --git a/README.md b/README.md
index d1653eb7b53..064297d0332 100644
--- a/README.md
+++ b/README.md
@@ -26,36 +26,34 @@ started:
 To install PyTorch/XLA a new TPU VM:
 
 ```
-pip install torch~=2.2.0 torch_xla[tpu]~=2.2.0 -f https://storage.googleapis.com/libtpu-releases/index.html
+pip install torch~=2.3.0 torch_xla[tpu]~=2.3.0 -f https://storage.googleapis.com/libtpu-releases/index.html
 ```
 
 To update your existing training loop, make the following changes:
 
 ```diff
 -import torch.multiprocessing as mp
++import torch_xla as xla
 +import torch_xla.core.xla_model as xm
-+import torch_xla.distributed.parallel_loader as pl
 +import torch_xla.distributed.xla_multiprocessing as xmp
 
  def _mp_fn(index):
    ...
 
 +  # Move the model paramters to your XLA device
-+  model.to(xm.xla_device())
-+
-+  # MpDeviceLoader preloads data to the XLA device
-+  xla_train_loader = pl.MpDeviceLoader(train_loader, xm.xla_device())
-
--  for inputs, labels in train_loader:
-+  for inputs, labels in xla_train_loader:
-     optimizer.zero_grad()
-     outputs = model(inputs)
-     loss = loss_fn(outputs, labels)
-     loss.backward()
--    optimizer.step()
-+
-+    # `xm.optimizer_step` combines gradients across replicas
-+    xm.optimizer_step()
++  model.to(xla.device())
+
+   for inputs, labels in train_loader:
++    with xla.step():
++      # Transfer data to the XLA device. This happens asynchronously.
++      inputs, labels = inputs.to(xla.device()), labels.to(xla.device())
+       optimizer.zero_grad()
+       outputs = model(inputs)
+       loss = loss_fn(outputs, labels)
+       loss.backward()
+-      optimizer.step()
++      # `xm.optimizer_step` combines gradients across replicas
++      xm.optimizer_step(optimizer)
 
  if __name__ == '__main__':
 -  mp.spawn(_mp_fn, args=(), nprocs=world_size)
@@ -69,8 +67,7 @@ If you're using `DistributedDataParallel`, make the following changes:
 ```diff
  import torch.distributed as dist
 -import torch.multiprocessing as mp
-+import torch_xla.core.xla_model as xm
-+import torch_xla.distributed.parallel_loader as pl
++import torch_xla as xla
 +import torch_xla.distributed.xla_multiprocessing as xmp
 +import torch_xla.distributed.xla_backend
 
@@ -89,15 +86,15 @@ If you're using `DistributedDataParallel`, make the following changes:
 
 -  model = model.to(rank)
 -  ddp_model = DDP(model, device_ids=[rank])
-+  xla_train_loader = pl.MpDeviceLoader(train_loader, xm.xla_device())
 
--  for inputs, labels in train_loader:
-+  for inputs, labels in xla_train_loader:
-     optimizer.zero_grad()
-     outputs = ddp_model(inputs)
-     loss = loss_fn(outputs, labels)
-     loss.backward()
-     optimizer.step()
+   for inputs, labels in train_loader:
++    with xla.step():
++      inputs, labels = inputs.to(xla.device()), labels.to(xla.device())
+       optimizer.zero_grad()
+       outputs = ddp_model(inputs)
+       loss = loss_fn(outputs, labels)
+       loss.backward()
+       optimizer.step()
 
  if __name__ == '__main__':
 -  mp.spawn(_mp_fn, args=(), nprocs=world_size)
@@ -132,31 +129,36 @@ Our comprehensive user guides are available at:
 
 PyTorch/XLA releases starting with version r2.1 will be available on PyPI. You
 can now install the main build with `pip install torch_xla`. To also install the
-Cloud TPU plugin, install the optional `tpu` dependencies:
+Cloud TPU plugin, install the optional `tpu` dependencies after installing the main build with
 
 ```
 pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
 ```
 
-GPU, XRT (legacy runtime), and nightly builds are available in our public GCS
-bucket.
+GPU and nightly builds are available in our public GCS bucket.
 
 | Version | Cloud TPU/GPU VMs Wheel |
 | --- | ----------- |
-| 2.2 (Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.2.0-cp38-cp38-manylinux_2_28_x86_64.whl` |
-| 2.2 (Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.2.0-cp310-cp310-manylinux_2_28_x86_64.whl` |
-| 2.2 (CUDA 12.1 + Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.2.0-cp38-cp38-manylinux_2_28_x86_64.whl` |
-| 2.2 (CUDA 12.1 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.2.0-cp310-cp310-manylinux_2_28_x86_64.whl` |
+| 2.3 (Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.3.0-cp38-cp38-manylinux_2_28_x86_64.whl` |
+| 2.3 (Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.3.0-cp310-cp310-manylinux_2_28_x86_64.whl` |
+| 2.3 (CUDA 12.1 + Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.3.0-cp38-cp38-manylinux_2_28_x86_64.whl` |
+| 2.3 (CUDA 12.1 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.3.0-cp310-cp310-manylinux_2_28_x86_64.whl` |
 | nightly (Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp38-cp38-linux_x86_64.whl` |
 | nightly (Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp310-cp310-linux_x86_64.whl` |
 | nightly (CUDA 12.1 + Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-nightly-cp38-cp38-linux_x86_64.whl` |
 
+You can also add `+yyyymmdd` after `torch_xla-nightly` to get the nightly wheel of a specified date. To get the companion pytorch nightly wheel, replace the `torch_xla` with `torch` on above wheel links.
+
 <details>
 
 <summary>older versions</summary>
 
 | Version | Cloud TPU VMs Wheel |
 |---------|-------------------|
+| 2.2 (Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.2.0-cp38-cp38-manylinux_2_28_x86_64.whl` |
+| 2.2 (Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.2.0-cp310-cp310-manylinux_2_28_x86_64.whl` |
+| 2.2 (CUDA 12.1 + Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.2.0-cp38-cp38-manylinux_2_28_x86_64.whl` |
+| 2.2 (CUDA 12.1 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.2.0-cp310-cp310-manylinux_2_28_x86_64.whl` |
 | 2.1 (XRT + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/xrt/tpuvm/torch_xla-2.1.0%2Bxrt-cp310-cp310-manylinux_2_28_x86_64.whl` |
 | 2.1 (Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.1.0-cp38-cp38-linux_x86_64.whl` |
 | 2.0 (Python 3.8) | `https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-2.0-cp38-cp38-linux_x86_64.whl` |
@@ -202,25 +204,29 @@ wheels for `torch` and `torch_xla` at
 | --- | ----------- |
 | 2.0 | `https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-2.0-cp310-cp310-linux_x86_64.whl` |
 
-You can also add `+yyyymmdd` after `torch_xla-nightly` to get the nightly wheel
-of a specified date. To get the companion pytorch and torchvision nightly wheel,
-replace the `torch_xla` with `torch` or `torchvision` on above wheel links.
 </details>
 
 ### Docker
 
 | Version | Cloud TPU VMs Docker |
 | --- | ----------- |
+| 2.3 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.3.0_3.10_tpuvm` |
 | 2.2 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.2.0_3.10_tpuvm` |
 | 2.1 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.1.0_3.10_tpuvm` |
 | 2.0 | `gcr.io/tpu-pytorch/xla:r2.0_3.8_tpuvm` |
 | 1.13 | `gcr.io/tpu-pytorch/xla:r1.13_3.8_tpuvm` |
 | nightly python | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm` |
 
+To use the above dockers, please pass `--privileged --net host --shm-size=16G` along. Here is an example:
+```bash
+docker run --privileged --net host --shm-size=16G -it us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm /bin/bash
+```
+
 <br/>
 
 | Version | GPU CUDA 12.1 Docker |
 | --- | ----------- |
+| 2.3 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.3.0_3.10_cuda_12.1` |
 | 2.2 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.2.0_3.10_cuda_12.1` |
 | 2.1 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.1.0_3.10_cuda_12.1` |
 | nightly | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1` |
@@ -281,10 +287,10 @@ See the [contribution guide](CONTRIBUTING.md).
 
 ## Disclaimer
 
-This repository is jointly operated and maintained by Google, Facebook and a
+This repository is jointly operated and maintained by Google, Meta and a
 number of individual contributors listed in the
 [CONTRIBUTORS](https://github.com/pytorch/xla/graphs/contributors) file. For
-questions directed at Facebook, please send an email to opensource@fb.com. For
+questions directed at Meta, please send an email to opensource@fb.com. For
 questions directed at Google, please send an email to
 pytorch-xla@googlegroups.com. For all other questions, please open up an issue
 in this repository [here](https://github.com/pytorch/xla/issues).
diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md
index 8f04e683822..22f6d01e374 100644
--- a/TROUBLESHOOTING.md
+++ b/TROUBLESHOOTING.md
@@ -56,11 +56,7 @@ report sent to us if you have it.
 
 ## PyTorch/XLA Debugging Tool
 
-You can enable the PyTorch/XLA debugging tool by setting `PT_XLA_DEBUG=1`, which provides a couple useful debugging features.
-
-## PyTorch/XLA + Dynamo Debugging Tool
-
-You can enable the PyTorch/XLA + Dynamo debugging tool by setting `XLA_DYNAMO_DEBUG=1`.
+You can enable the PyTorch/XLA debugging tool by setting `PT_XLA_DEBUG_LEVEL=2`, which provides a couple useful debugging features. You can also lower the debug level to `1` to slip the execution analysis.
 
 ### Perform A Auto-Metrics Analysis
 
@@ -79,41 +75,44 @@ The debugging tool will analyze every compilation and execution for your model.
 ```
 Compilation Analysis: ================================================================================
 Compilation Analysis: Compilation Cause
-Compilation Analysis:   user mark_step
-Compilation Analysis: Graph Info:
-Compilation Analysis:   Graph Hash: 537d4b0264b029688281412214d252e9
-Compilation Analysis:   Number of Graph Inputs: 588
-Compilation Analysis:   Number of Graph Outputs: 320
-Compilation Analysis: Python Frame Triggered Execution:
-Compilation Analysis:   mark_step (/workspaces/dk2/pytorch/xla/torch_xla/core/xla_model.py:840)
-Compilation Analysis:   broadcast_master_param (/workspaces/dk2/pytorch/xla/torch_xla/core/xla_model.py:1230)
-Compilation Analysis:   train_imagenet (/workspaces/dk2/pytorch/xla/test/test_train_mp_imagenet.py:261)
-Compilation Analysis:   _mp_fn (/workspaces/dk2/pytorch/xla/test/test_train_mp_imagenet.py:365)
-Compilation Analysis:   __call__ (/workspaces/dk2/pytorch/xla/torch_xla/_internal/pjrt.py:176)
-Compilation Analysis:   _thread_fn (/workspaces/dk2/pytorch/xla/torch_xla/_internal/pjrt.py:70)
-Compilation Analysis:   run (/usr/local/lib/python3.8/concurrent/futures/thread.py:57)
-Compilation Analysis:   _worker (/usr/local/lib/python3.8/concurrent/futures/thread.py:80)
-Compilation Analysis:   ..........
+Compilation Analysis:   mark_step in parallel loader at step end
+Compilation Analysis: Graph Info: 
+Compilation Analysis:   Graph Hash: c74c3b91b855b2b123f833b0d5f86943
+Compilation Analysis:   Number of Graph Inputs: 35
+Compilation Analysis:   Number of Graph Outputs: 107
+Compilation Analysis: Python Frame Triggered Execution: 
+Compilation Analysis:   mark_step (/workspaces/dk3/pytorch/xla/torch_xla/core/xla_model.py:1055)
+Compilation Analysis:   next (/workspaces/dk3/pytorch/xla/torch_xla/distributed/parallel_loader.py:44)
+Compilation Analysis:   __next__ (/workspaces/dk3/pytorch/xla/torch_xla/distributed/parallel_loader.py:32)
+Compilation Analysis:   train_loop_fn (/workspaces/dk3/pytorch/xla/examples/train_decoder_only_base.py:48)
+Compilation Analysis:   start_training (/workspaces/dk3/pytorch/xla/examples/train_decoder_only_base.py:65)
+Compilation Analysis:   <module> (/workspaces/dk3/pytorch/xla/examples/train_decoder_only_base.py:73)
 Compilation Analysis: --------------------------------------------------------------------------------
 Compilation Analysis: ================================================================================
 
+Post Compilation Analysis: ================================================================================
+Post Compilation Analysis: Graph input size: 1.548000 GB
+Post Compilation Analysis: Graph output size: 7.922460 GB
+Post Compilation Analysis: Aliased Input size: 1.547871 GB
+Post Compilation Analysis: Intermediate tensor size: 12.124478 GB
+Post Compilation Analysis: Compiled program size: 0.028210 GB
+Post Compilation Analysis: --------------------------------------------------------------------------------
+Post Compilation Analysis: ================================================================================
+
 Execution Analysis: ================================================================================
 Execution Analysis: Execution Cause
-Execution Analysis:   user mark_step
-Execution Analysis: Graph Info:
-Execution Analysis:   Graph Hash: 537d4b0264b029688281412214d252e9
-Execution Analysis:   Number of Graph Inputs: 588
-Execution Analysis:   Number of Graph Outputs: 320
-Execution Analysis: Python Frame Triggered Execution:
-Execution Analysis:   mark_step (/workspaces/dk2/pytorch/xla/torch_xla/core/xla_model.py:840)
-Execution Analysis:   broadcast_master_param (/workspaces/dk2/pytorch/xla/torch_xla/core/xla_model.py:1230)
-Execution Analysis:   train_imagenet (/workspaces/dk2/pytorch/xla/test/test_train_mp_imagenet.py:261)
-Execution Analysis:   _mp_fn (/workspaces/dk2/pytorch/xla/test/test_train_mp_imagenet.py:365)
-Execution Analysis:   __call__ (/workspaces/dk2/pytorch/xla/torch_xla/_internal/pjrt.py:176)
-Execution Analysis:   _thread_fn (/workspaces/dk2/pytorch/xla/torch_xla/_internal/pjrt.py:70)
-Execution Analysis:   run (/usr/local/lib/python3.8/concurrent/futures/thread.py:57)
-Execution Analysis:   _worker (/usr/local/lib/python3.8/concurrent/futures/thread.py:80)
-Execution Analysis:   ..........
+Execution Analysis:   mark_step in parallel loader at step end
+Execution Analysis: Graph Info: 
+Execution Analysis:   Graph Hash: c74c3b91b855b2b123f833b0d5f86943
+Execution Analysis:   Number of Graph Inputs: 35
+Execution Analysis:   Number of Graph Outputs: 107
+Execution Analysis: Python Frame Triggered Execution: 
+Execution Analysis:   mark_step (/workspaces/dk3/pytorch/xla/torch_xla/core/xla_model.py:1055)
+Execution Analysis:   next (/workspaces/dk3/pytorch/xla/torch_xla/distributed/parallel_loader.py:44)
+Execution Analysis:   __next__ (/workspaces/dk3/pytorch/xla/torch_xla/distributed/parallel_loader.py:32)
+Execution Analysis:   train_loop_fn (/workspaces/dk3/pytorch/xla/examples/train_decoder_only_base.py:48)
+Execution Analysis:   start_training (/workspaces/dk3/pytorch/xla/examples/train_decoder_only_base.py:65)
+Execution Analysis:   <module> (/workspaces/dk3/pytorch/xla/examples/train_decoder_only_base.py:73)
 Execution Analysis: --------------------------------------------------------------------------------
 Execution Analysis: ================================================================================
 ```
@@ -127,7 +126,7 @@ Some common causes of Compilation/Executation are
 
 The executation caused by 1-4 are expected, and we want to avoid 5 by either reduce the frequency of accessing tensor values or manually add a `mark_step` before accessing.
 
-Users should expect to see this `Compilation Cause` + `Executation Cause` pairs for first couple steps. After the model stabilize users should expect to only see `Execution Cause`. To use PyTorch/XLA efficiently, we expect the same models code to be run for every step and compilation only happen once for every graph. If you keep seeing `Compilation Cause`, you should try to dump the IR/HLO following [this section](#common-debugging-environment-variables-combinations) and compare the graphs for each step and understand the source of the differences.
+Users should expect to see this `Compilation Cause` + `Executation Cause` pairs for first couple steps. After the model stabilize users should expect to only see `Execution Cause`(you can disable execution analysis by `PT_XLA_DEBUG_LEVEL=1`). To use PyTorch/XLA efficiently, we expect the same models code to be run for every step and compilation only happen once for every graph. If you keep seeing `Compilation Cause`, you should try to dump the IR/HLO following [this section](#common-debugging-environment-variables-combinations) and compare the graphs for each step and understand the source of the differences.
 
 Following section will explain how to get and understand a more detail metrics report.
 
@@ -192,6 +191,10 @@ import torch_xla.debug.metrics as met
 met.clear_all()
 ```
 
+## PyTorch/XLA + Dynamo Debugging Tool
+
+You can enable the PyTorch/XLA + Dynamo debugging tool by setting `XLA_DYNAMO_DEBUG=1`.
+
 ## Performance Profiling
 To profile your workload in depth to understand bottlenecks please check the following resources:
 * [Official tutorial](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm)
@@ -199,6 +202,9 @@ To profile your workload in depth to understand bottlenecks please check the fol
 * [Sample MNIST training script with profiling](https://github.com/pytorch/xla/blob/master/test/test_profile_mp_mnist.py)
 * [Utility script for capturing performance profiles](https://github.com/pytorch/xla/blob/master/scripts/capture_profile.py)
 
+## Simple Benchmarking
+Take a look at [`examples/train_resnet_benchmark.py`](https://github.com/pytorch/xla/blob/master/examples/train_resnet_benchmark.py) for how to benchmark a PyTorch/XLA model.
+
 ## Known Performance Caveats
 
 PyTorch/XLA behaves semantically like regular PyTorch and XLA tensors share the full tensor interface with CPU & GPU tensors.
@@ -331,25 +337,6 @@ only be enabled for debugging.
   by one. This is useful to bypass the long compilation time but overall step time will be a lot slower and memory usage will be higher
   since all compiler optimizaiton will be skipped.
 
-* ```XLA_USE_BF16```: If set to 1, transforms all the _PyTorch_ _Float_ values into _BiFloat16_
-  when sending to the _TPU_ device. Note that when using `XLA_USE_BF16=1` tensor arithmetic will
-  be done in reduced precision and so tensors will not be accurate if accumulated over time.
-  For example:
-
-  ```
-  # In reduced bfloat16 precision
-  >>> torch.tensor(4096, dtype=torch.bfloat16) + torch.tensor(1, dtype=torch.bfloat16)
-  tensor(4096., dtype=torch.bfloat16)
-  # Whereas in full float32 precision
-  >>> torch.tensor(4096) + torch.tensor(1)
-  tensor(4097)
-  ```
-  So to get accurate metrics such as average loss value over many steps, use manual mixed
-  precision where metrics stay in FP32.
-
-* ```XLA_USE_F16```: If set to 1, transforms all the _PyTorch_ _Float_ values into _Float16_
-  (_PyTorch_ _Half_ type) when sending to devices which supports them.
-
 * ```TF_CPP_LOG_THREAD_ID```: If set to 1, the TF logs will show the thread ID
   helping with debugging multithreaded processes.
 
diff --git a/WORKSPACE b/WORKSPACE
index e4d8a73fdc0..5d0f2a7c931 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -31,6 +31,17 @@ python_configure(
     name = "local_config_python",
     python_version = "3",  # required to use `python3-config`
 )
+
+################################ PyTorch Setup ################################
+
+load("//bazel:dependencies.bzl", "PYTORCH_LOCAL_DIR")
+
+new_local_repository(
+    name = "torch",
+    build_file = "//bazel:torch.BUILD",
+    path = PYTORCH_LOCAL_DIR,
+)
+
 ############################# OpenXLA Setup ###############################
 
 # To update OpenXLA to a new revision,
@@ -38,6 +49,9 @@ python_configure(
 # b) get the sha256 hash of the commit by running:
 #    curl -L https://github.com/openxla/xla/archive/<git hash>.tar.gz | sha256sum
 #    and update the sha256 with the result.
+
+xla_hash = '98db3e8c8f64dede911fd97605f76aaf6ede1153'
+
 http_archive(
     name = "xla",
     patch_args = [
@@ -50,12 +64,42 @@ http_archive(
         "//openxla_patches:gpu_race_condition.diff",
         "//openxla_patches:f16_abi_clang.diff",
     ],
-    strip_prefix = "xla-1acf05ef0d41181caaf0cd691aa9d453ffc41a73",
+    strip_prefix = "xla-" + xla_hash,
     urls = [
-        "https://github.com/openxla/xla/archive/1acf05ef0d41181caaf0cd691aa9d453ffc41a73.tar.gz",
+        "https://github.com/openxla/xla/archive/" + xla_hash + ".tar.gz",
     ],
 )
 
+# Initialize hermetic Python
+load("@xla//third_party/py:python_init_rules.bzl", "python_init_rules")
+
+python_init_rules()
+
+load("@xla//third_party/py:python_init_repositories.bzl", "python_init_repositories")
+
+python_init_repositories(
+    requirements = {
+        "3.8": "//:requirements_lock_3_8.txt",
+        "3.9": "//:requirements_lock_3_9.txt",
+        "3.10": "//:requirements_lock_3_10.txt",
+        "3.11": "//:requirements_lock_3_11.txt",
+    },
+    local_wheel_workspaces = ["@torch//:WORKSPACE"],
+    default_python_version = "system",
+)
+
+load("@xla//third_party/py:python_init_toolchains.bzl", "python_init_toolchains")
+
+python_init_toolchains()
+
+load("@xla//third_party/py:python_init_pip.bzl", "python_init_pip")
+
+python_init_pip()
+
+load("@pypi//:requirements.bzl", "install_deps")
+
+install_deps()
+
 # For development, one often wants to make changes to the OpenXLA repository as well
 # as the PyTorch/XLA repository. You can override the pinned repository above with a
 # local checkout by either:
@@ -90,12 +134,3 @@ load("@xla//:workspace0.bzl", "xla_workspace0")
 
 xla_workspace0()
 
-################################ PyTorch Setup ################################
-
-load("//bazel:dependencies.bzl", "PYTORCH_LOCAL_DIR")
-
-new_local_repository(
-    name = "torch",
-    build_file = "//bazel:torch.BUILD",
-    path = PYTORCH_LOCAL_DIR,
-)
diff --git a/benchmarks/benchmark_experiment.py b/benchmarks/benchmark_experiment.py
index fb5227a6972..26a56c49f24 100644
--- a/benchmarks/benchmark_experiment.py
+++ b/benchmarks/benchmark_experiment.py
@@ -22,6 +22,7 @@ def list_experiment_configs(self):
         "xla": [None, "PJRT", "XRT"],
         "xla_flags": [None],
         "dynamo": [None, "inductor", "openxla_eval", "openxla"],
+        "torch_xla2": [None],  # options only apply to torch_xla2
         "test": ["eval", "train"],
     }
 
@@ -30,6 +31,9 @@ def list_experiment_configs(self):
       config_choices["accelerator"] = list(set(self._args.accelerator))
     if self._args.xla:
       config_choices["xla"] = list(map(parse_none_str, set(self._args.xla)))
+    if self._args.torch_xla2:
+      config_choices["torch_xla2"] = list(
+          map(parse_none_str, set(self._args.torch_xla2)))
     if self._args.dynamo:
       config_choices["dynamo"] = list(
           map(parse_none_str, set(self._args.dynamo)))
@@ -66,12 +70,17 @@ def _is_available(self, experiment_config):
     cfg_accelerator = experiment_config["accelerator"]
     cfg_xla = experiment_config["xla"]
     cfg_test = experiment_config["test"]
+    cfg_torch_xla2 = experiment_config["torch_xla2"]
 
     # Check that dynamo refers to an existing backend.
     if cfg_dynamo is not None and cfg_dynamo not in dynamo.list_backends(
         exclude_tags=()):
       return False
 
+    # torch_xla2 doesn't support dynamo at this time.
+    if cfg_dynamo is not None and cfg_torch_xla2:
+      return False
+
     # Check dynamo backend-specifics constraints.
     if cfg_dynamo == "inductor":
       if cfg_accelerator == "tpu" or cfg_xla is not None:
@@ -110,22 +119,26 @@ def load_experiment(self, experiment_config):
     dynamo = experiment_config["dynamo"]
     test = experiment_config["test"]
     batch_size = experiment_config.get("batch_size", self._args.batch_size)
+    torch_xla2 = experiment_config["torch_xla2"]
     return BenchmarkExperiment(
         accelerator=accelerator,
         xla=xla,
         xla_flags=xla_flags,
         dynamo=dynamo,
+        torch_xla2=torch_xla2,
         test=test,
         batch_size=batch_size)
 
 
 class BenchmarkExperiment:
 
-  def __init__(self, accelerator, xla, xla_flags, dynamo, test, batch_size):
+  def __init__(self, accelerator, xla, xla_flags, dynamo, torch_xla2, test,
+               batch_size):
     self.accelerator = accelerator
     self.xla = xla
     self.xla_flags = xla_flags
     self.dynamo = dynamo
+    self.torch_xla2 = torch_xla2
     self.test = test
     self.batch_size = batch_size
     self.accelerator_model = get_accelerator_model(self.accelerator)
@@ -138,6 +151,9 @@ def update_process_env(self, process_env):
       process_env.pop("XRT_TPU_CONFIG", None)
       process_env.pop("XLA_FLAGS", None)
 
+    if self.torch_xla2:
+      process_env["JAX_PLATFORMS"] = self.accelerator.lower()
+
     if self.xla == "PJRT":
       process_env["PJRT_DEVICE"] = self.accelerator.upper()
     elif self.xla == "XRT":
@@ -185,6 +201,7 @@ def to_dict(self):
     d["xla"] = self.xla
     d["xla_flags"] = self.xla_flags
     d["dynamo"] = self.dynamo
+    d["torch_xla2"] = self.torch_xla2
     d["test"] = self.test
     d["batch_size"] = self.batch_size
     return d
diff --git a/benchmarks/benchmark_model.py b/benchmarks/benchmark_model.py
index 59a430d3982..62b53e126a0 100644
--- a/benchmarks/benchmark_model.py
+++ b/benchmarks/benchmark_model.py
@@ -5,6 +5,7 @@
 import torch
 import torch.nn as nn
 from torch._dynamo.testing import collect_results
+from torch.utils import _pytree as pytree
 from util import cast_to_dtype, move_to_device
 
 logger = logging.getLogger(__name__)
@@ -110,14 +111,10 @@ def conversion_dtype(self):
   def prepare_for_experiment(self, dynamo_compilation_opts):
     self.device = self.benchmark_experiment.get_device()
     self.dtype = self.conversion_dtype()
-
     if self.dtype is not None:
       self.module = self.module.to(self.dtype)
       self.example_inputs = cast_to_dtype(self.example_inputs, self.dtype)
 
-    self.module = self.module.to(self.device)
-    self.example_inputs = move_to_device(self.example_inputs, self.device)
-
     if self.benchmark_experiment.test == "eval":
       self._prepare_for_eval()
     elif self.benchmark_experiment.test == "train":
@@ -125,6 +122,32 @@ def prepare_for_experiment(self, dynamo_compilation_opts):
     else:
       raise NotImplementedError
 
+    if self.benchmark_experiment.torch_xla2:
+      import torch_xla2.export
+      import torch_xla2
+      import jax
+      import jax.numpy as jnp
+      device = jax.devices()[0]
+      if self.benchmark_experiment.torch_xla2 == 'torch_export':
+        # for torch_xla2, we export model to FX graph and move weights to JAX device
+        exported = torch.export.export(self.module, self.example_inputs)
+        weights, jax_func = torch_xla2.export.exported_program_to_jax(exported)
+      elif self.benchmark_experiment.torch_xla2 == 'extract_jax':
+        weights, jax_func = torch_xla2.extract_jax(self.module)
+      else:
+        raise ValueError("torch_xla2 option unavailable")
+      weights = pytree.tree_map_only(jnp.ndarray,
+                                     lambda x: jax.device_put(x, device),
+                                     weights)
+      jax_func = jax.jit(jax_func)
+      self.module = lambda *x: jax_func(weights, x)
+      self.example_inputs = move_to_device(self.example_inputs, device,
+                                           self.benchmark_experiment.torch_xla2)
+    else:
+      self.module = self.module.to(self.device)
+      self.example_inputs = move_to_device(self.example_inputs, self.device,
+                                           self.benchmark_experiment.torch_xla2)
+
     if self.benchmark_experiment.dynamo:
       compilation_opts = dynamo_compilation_opts.copy()
       compilation_opts['backend'] = self.benchmark_experiment.dynamo
diff --git a/benchmarks/experiment_runner.py b/benchmarks/experiment_runner.py
index 443a4067ac1..11719b476ec 100644
--- a/benchmarks/experiment_runner.py
+++ b/benchmarks/experiment_runner.py
@@ -103,6 +103,7 @@ def generate_and_run_all_configs(self):
         # TODO: See if we can pass experiment_cfg to `load_experiment`.
         benchmark_experiment = self.experiment_loader.load_experiment(
             experiment_cfg)
+
         benchmark_model = self.model_loader.load_model(
             model_cfg, benchmark_experiment, dummy=True)
 
@@ -221,7 +222,7 @@ def _default_iter_fn(self, benchmark_experiment, benchmark_model,
       tracing_time = time.perf_counter() - t_trace_start
 
     # Mark step.
-    self._mark_step(benchmark_experiment)
+    self._mark_step(benchmark_experiment, output)
     total_time = time.perf_counter() - total_time_start
     return output, total_time, tracing_time
 
@@ -284,7 +285,10 @@ def run_once_and_gather_metrics(self, benchmark_experiment, benchmark_model,
 
     # Reset state and sync.
     reset_rng_state(benchmark_experiment)
-    self._mark_step(benchmark_experiment)
+    if benchmark_experiment.torch_xla2:
+      self._mark_step(benchmark_experiment, inputs_list)
+    else:
+      self._mark_step(benchmark_experiment)
     self._synchronize(benchmark_experiment)
     met.clear_all()
     dynamo_utils.counters.clear()
@@ -307,7 +311,7 @@ def loop(pytorch_profile=None, iter_fn=None):
           total_timing += timing
 
         # Mark step.
-        self._mark_step(benchmark_experiment)
+        self._mark_step(benchmark_experiment, output)
         if pytorch_profile is not None:
           pytorch_profile.step()
 
@@ -319,8 +323,8 @@ def loop(pytorch_profile=None, iter_fn=None):
         self._args.profile_cuda_cpu or \
         self._args.profile_cuda_cpu_individual_ops
     enable_xla_profiling = self._args.profile_xla
-    assert not (enable_pytorch_profiling and enable_pytorch_profiling
-               ), "More than one profiling path enabled."
+    assert not (enable_pytorch_profiling and
+                enable_xla_profiling), "More than one profiling path enabled."
 
     if enable_xla_profiling:
       logdir = self._get_results_dir_path(experiment_config, model_config,
@@ -414,9 +418,14 @@ def _prepare_inputs(self, example_inputs, should_randomize_input):
       inputs_list.append(inputs)
     return inputs_list
 
-  def _mark_step(self, benchmark_experiment):
+  def _mark_step(self, benchmark_experiment, tensors_to_check=None):
     if benchmark_experiment.xla:
-      xm.mark_step()
+      if benchmark_experiment.torch_xla2:
+        assert tensors_to_check is not None, "torch_xla2 requires input tensor to block_until_ready"
+        import jax
+        jax.block_until_ready(tensors_to_check)
+      else:
+        xm.mark_step()
 
   def _synchronize(self, benchmark_experiment):
     if benchmark_experiment.xla:
@@ -859,6 +868,12 @@ def __str__(self):
       action="append",
       help="Flags to forward to XLA via `XLA_FLAGS` env var.",
   )
+  parser.add_argument(
+      "--torch-xla2",
+      choices=["extract_jax", "torch_export"],
+      action="append",
+      help="Choose to use torch_xla2 and which mode to use.",
+  )
   parser.add_argument(
       "--disable-tf32",
       action="store_true",
diff --git a/benchmarks/requirements.txt b/benchmarks/requirements.txt
new file mode 100644
index 00000000000..14e2549fec3
--- /dev/null
+++ b/benchmarks/requirements.txt
@@ -0,0 +1,3 @@
+tabulate
+scipy
+pandas
diff --git a/benchmarks/result_analyzer.py b/benchmarks/result_analyzer.py
index 358edc4230a..5b0cf7f4cd3 100644
--- a/benchmarks/result_analyzer.py
+++ b/benchmarks/result_analyzer.py
@@ -55,6 +55,7 @@ def run_csv(self):
         "xla": pd.Series(dtype="str"),
         "xla_flags": pd.Series(dtype="str"),
         "dynamo": pd.Series(dtype="str"),
+        "torch_xla2": pd.Series(dtype="str"),
         "test": pd.Series(dtype="str"),
         "batch_size": pd.Series(dtype="int"),
         "repeat": pd.Series(dtype="int"),
@@ -116,6 +117,8 @@ def extract_metrics_jsonl(self, file):
       xla_value = "None" if xla is None else xla
       dynamo = dataline["experiment"]["dynamo"]
       dynamo_value = "None" if dynamo is None else dynamo
+      torch_xla2 = dataline["experiment"]["torch_xla2"]
+      torch_xla2_value = "None" if torch_xla2 is None else torch_xla2
       test = dataline["experiment"]["test"]
       test_value = "None" if test is None else test
       outputs_file = dataline["experiment"].get("outputs_file", None)
@@ -135,6 +138,7 @@ def extract_metrics_jsonl(self, file):
               "accelerator_model": dataline["experiment"]["accelerator_model"],
               "xla": xla_value,
               "dynamo": dynamo_value,
+              "torch_xla2": torch_xla2_value,
               "test": test_value,
               "outputs_file": outputs_file_value
           }
@@ -175,6 +179,7 @@ def extract_metrics_csv(self, file, metric_df):
           "xla": dataline["experiment"]["xla"],
           "xla_flags": dataline["experiment"]["xla_flags"],
           "dynamo": dataline["experiment"]["dynamo"],
+          "torch_xla2": dataline["experiment"]["torch_xla2"],
           "test": dataline["experiment"]["test"],
           "batch_size": dataline["experiment"]["batch_size"],
           "repeat": dataline["repeat"],
diff --git a/benchmarks/run_benchmark.sh b/benchmarks/run_benchmark.sh
index fd8a055bccc..e4e483947d9 100644
--- a/benchmarks/run_benchmark.sh
+++ b/benchmarks/run_benchmark.sh
@@ -5,7 +5,7 @@ LOGFILE=/tmp/benchmark_test.log
 
 # Note [Keep Going]
 #
-# Set the `CONTINUE_ON_ERROR` flag to `true` to make the CircleCI tests continue on error.
+# Set the `CONTINUE_ON_ERROR` flag to `true` to make the CI tests continue on error.
 # This will allow you to see all the failures on your PR, not stopping with the first
 # test failure like the default behavior.
 CONTINUE_ON_ERROR="${CONTINUE_ON_ERROR:-0}"
diff --git a/benchmarks/torchbench_model.py b/benchmarks/torchbench_model.py
index 741463ec65f..838a5014187 100644
--- a/benchmarks/torchbench_model.py
+++ b/benchmarks/torchbench_model.py
@@ -224,7 +224,7 @@ def _cleanup(self):
     gc.collect()
 
     # If we are using CUDA, clean-up its cache left-over.
-    if self.benchmark_experiment.accelerator == "cuda":
+    if self.is_accelerator_cuda():
       torch.cuda.empty_cache()
 
   def set_up(self):
@@ -246,14 +246,16 @@ def set_up(self):
     benchmark = self.load_benchmark()
 
     self.module, self.example_inputs = benchmark.get_module()
-
+    if isinstance(self.example_inputs,
+                  dict) and "input_ids" in self.example_inputs:
+      self.example_inputs = (self.example_inputs['input_ids'],)
     self.benchmark_experiment.batch_size = benchmark.batch_size
 
     # Move the initialized model to XLA device.
     if self.benchmark_experiment.xla:
       # First, move the model and the inputs to CPU.
       # This avoids having dupplicated data on CUDA.
-      if self.benchmark_experiment.accelerator == "cuda":
+      if self.is_accelerator_cuda():
         self.module = self.module.to("cpu")
         self.example_inputs = move_to_device(self.example_inputs, "cpu")
         self._cleanup()
@@ -305,8 +307,9 @@ def load_benchmark(self):
     # torch.backends.__allow_nonbracketed_mutation_flag = True
 
     # torchbench uses `xla` as device instead of `tpu`
-    if (device := self.benchmark_experiment.accelerator) == 'tpu':
-      device = str(self.benchmark_experiment.get_device())
+    device = (
+        str(self.benchmark_experiment.get_device())
+        if self.is_accelerator_tpu() else self.benchmark_experiment.accelerator)
 
     return self.benchmark_cls()(
         test=self.benchmark_experiment.test,
@@ -330,6 +333,12 @@ def is_inference(self):
   def is_training(self):
     return self.benchmark_experiment.test == "train"
 
+  def is_accelerator_cuda(self):
+    return self.benchmark_experiment.accelerator == "cuda"
+
+  def is_accelerator_tpu(self):
+    return self.benchmark_experiment.accelerator == "tpu"
+
   def use_amp(self):
     return self.is_training(
     ) or self.model_name in FORCE_AMP_FOR_FP16_BF16_MODELS
@@ -350,23 +359,27 @@ def conversion_dtype(self):
 
   def _get_autocast_with_kwargs(self):
     kwargs = {}
-
     if self.use_amp():
-      # Set the default data-type based on the accelerator.
-      if self.benchmark_experiment.accelerator == "cuda":
+      # TODO: Should call device specific autocast implementations.
+      # Specifically, we should be using:
+      #   - torch.cuda.amp.autocast for inductor
+      #   - torch_xla.amp.autocast for PyTorch/XLA experiments.
+      # PyTorch/XLA autocast does not run with dynamo, though:
+      # https://github.com/pytorch/xla/issues/6511
+      if self.is_accelerator_cuda():
+        # For inductor and XLA:CUDA, we use CUDA autocast.
+        autocast = torch.cuda.amp.autocast
         kwargs["dtype"] = torch.float16
-      else:
-        # Both CPU and TPU autocast mode defaults to bfloat16.
-        kwargs["dtype"] = torch.bfloat16
-
-      if self.benchmark_experiment.xla:
-        # Should call device specific autocast implementations.
-        # PyTorch/XLA autocast does not run with dynamo, though:
-        # https://github.com/pytorch/xla/issues/6511
+      elif self.is_accelerator_tpu():
         autocast = torch.amp.autocast
         kwargs["device_type"] = "xla"
+        kwargs["dtype"] = torch.bfloat16
       else:
-        autocast = torch.cuda.amp.autocast
+        # Error: AMP is only supported on XLA:CUDA and XLA:TPU.
+        name = self.model_name
+        accelerator = self.benchmark_experiment.accelerator
+        raise RuntimeError(f"Tried to run {name} with AMP on {accelerator}. "
+                           "However, AMP is only supported on cuda and tpu.")
     else:
       autocast = contextlib.nullcontext
     return (autocast, kwargs)
diff --git a/benchmarks/util.py b/benchmarks/util.py
index ce56ceb4143..8ab1d5a0181 100644
--- a/benchmarks/util.py
+++ b/benchmarks/util.py
@@ -50,7 +50,7 @@ def reset_rng_state(benchmark_experiment=None):
   torch.manual_seed(1337)
   random.seed(1337)
   np.random.seed(1337)
-  if benchmark_experiment is not None and benchmark_experiment.xla is not None:
+  if benchmark_experiment is not None and benchmark_experiment.xla is not None and benchmark_experiment.torch_xla2 is not None:
     device = benchmark_experiment.get_device()
     xm.set_rng_state(1337, str(device))
 
@@ -76,8 +76,15 @@ def is_xla_device_available(devkind):
   return r.returncode == 0
 
 
-def move_to_device(item, device):
-  return pytree.tree_map_only(torch.Tensor, lambda t: t.to(device), item)
+def move_to_device(item, device, torch_xla2=False):
+  if torch_xla2:
+    import torch_xla2
+    import jax
+    move_to_device_func = lambda t: jax.device_put(
+        torch_xla2.tensor.t2j(t), device)
+  else:
+    move_to_device_func = lambda t: t.to(device)
+  return pytree.tree_map_only(torch.Tensor, move_to_device_func, item)
 
 
 def cast_to_dtype(item, dtype):
diff --git a/build_util.py b/build_util.py
index 78e4bd5e453..487f5116323 100644
--- a/build_util.py
+++ b/build_util.py
@@ -36,10 +36,6 @@ def bazel_options_from_env() -> Iterable[str]:
     bazel_flags.append('--remote_default_exec_properties=cache-silo-key=%s' %
                        cache_silo_name)
 
-  if check_env_flag('BUILD_CPP_TESTS', default='0'):
-    bazel_flags.append('//test/cpp:all')
-    bazel_flags.append('//torch_xla/csrc/runtime:all')
-
   bazel_jobs = os.getenv('BAZEL_JOBS', default='')
   if bazel_jobs:
     bazel_flags.append('--jobs=%s' % bazel_jobs)
diff --git a/codegen/BUILD b/codegen/BUILD
index ab7bbe4c3af..79ae421f920 100644
--- a/codegen/BUILD
+++ b/codegen/BUILD
@@ -8,6 +8,10 @@ py_binary(
         "//torch_xla/csrc:aten_xla_type.cpp",
         "@torch//:torchgen_deps",
     ],
+    deps = [
+        "@pypi_torch//:pkg",
+        "@pypi_pyyaml//:pkg",
+    ],
     tags = [
         "local",
         "no-remote-exec",
diff --git a/codegen/xla_native_functions.yaml b/codegen/xla_native_functions.yaml
index 199025dc7e1..de5500a0c5b 100644
--- a/codegen/xla_native_functions.yaml
+++ b/codegen/xla_native_functions.yaml
@@ -361,6 +361,7 @@ supported:
   - zero_
   - _native_batch_norm_legit
   - _native_batch_norm_legit.no_stats
+  - _embedding_bag_forward_only
   # Note: [functionalization and CompositeExplicitAutograd]
   # Below are all operators that are "composite" in core,
   # but require us to explicitly re-enable functionalization in order to use them.
diff --git a/configuration.yaml b/configuration.yaml
index a2bb9da6de7..1ea15dca81a 100644
--- a/configuration.yaml
+++ b/configuration.yaml
@@ -122,27 +122,6 @@ variables:
           XLANativeFunctions::_copy_from.
       type: bool
       default_value: true
-    XLA_USE_BF16:
-      description:
-        - Tensor arithmetic will be done in reduced precision and so tensors
-          will not be accurate if accumulated over time.
-      type: bool
-      default_value: false
-    XLA_USE_F16:
-      description:
-        - If set to true, transforms all the PyTorch Float values into Float16
-          (PyTorch Half type) when sending to devices which supports them.
-      type: bool
-      default_value: false
-    XLA_USE_32BIT_LONG:
-      description:
-        - If set to true, maps PyTorch Long types to XLA 32bit type. On the
-          versions of the TPU HW at the time of writing, 64bit integer
-          computations are expensive, so setting this flag might help. It
-          should be verified by the user that truncating to 32bit values is a
-          valid operation according to the use of PyTorch Long values in it.
-      type: bool
-      default_value: false
     XLA_IO_THREAD_POOL_SIZE:
       description:
         - Number of threads for the IO thread pool in the XLA client. Defaults
diff --git a/contrib/vscode/settings.json b/contrib/vscode/settings.json
index 59b86e622e7..b09fba227e6 100644
--- a/contrib/vscode/settings.json
+++ b/contrib/vscode/settings.json
@@ -14,9 +14,19 @@
     "coverage-gutters.coverageFileNames": [
         "./bazel-out/_coverage/_coverage_report.dat"
     ],
-    "lcov.path": [
-        "./bazel-out/_coverage/_coverage_report.dat"
+    "git.detectSubmodules": false,
+    "[python]": {
+        "editor.defaultFormatter": "eeyore.yapf",
+        "editor.formatOnSave": true,
+    },
+    "python.analysis.exclude": [
+        "**/third_party",
+        "**/build",
+        "**/__pycache__",
+        "**/.git",
     ],
-    "python.formatting.provider": "yapf",
-    "editor.formatOnSave": true
+    "[cpp]": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "editor.formatOnSave": true,
+    }
 }
diff --git a/docs/README.md b/docs/README.md
index 33a0ce5bc36..88ab7f44f03 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,25 +1,25 @@
 ## Publish documentation for a new release.
 
-CircleCI job `pytorch_xla_linux_debian11_and_push_doc` is specified to run on `release/*` branches, but it was not
+CI job `pytorch_xla_linux_debian11_and_push_doc` is specified to run on `release/*` branches, but it was not
 run on release branches due to "Only build pull requests" setting. Turning off "Only build pull requests" will result
 in much larger volumes in jobs which is often unnecessary. We're waiting for [this feature request](https://ideas.circleci.com/ideas/CCI-I-215)
 to be implemented so that we could override this setting on some branches.
 
 Before the feature is available on CircleCi side, we'll use a manual process to publish documentation for release.
-[Documentation for master branch](http://pytorch.org/xla/master/) is still updated automatically by the CircleCI job.
+[Documentation for master branch](http://pytorch.org/xla/master/) is still updated automatically by the CI job.
 But we'll need to manually commit the new versioned doc and point http://pytorch.org/xla to the documentation of new
 stable release.
 
-Take 1.5 release as example:
+Take 2.3 release as example:
 ```
-# Build pytorch/pytorch:release/1.5 and pytorch/xla:release/1.5 respectively.
+# Build pytorch/pytorch:release/2.3 and pytorch/xla:release/2.3 respectively.
 # In pytorch/xla/docs
 ./docs_build.sh
 git clone -b gh-pages https://github.com/pytorch/xla.git /tmp/xla
-cp -r build/* /tmp/xla/release/1.5
+cp -r build/* /tmp/xla/release/2.3
 cd /tmp/xla
 # Update `redirect_url` in index.md
 git add .
-git commit -m "Publish 1.5 documentation."
+git commit -m "Publish 2.3 documentation."
 git push origin gh-pages
-```
\ No newline at end of file
+```
diff --git a/docs/assets/ci_test_dependency.png b/docs/assets/ci_test_dependency.png
new file mode 100644
index 0000000000000000000000000000000000000000..e4b2c397ba0d3b42ac7c665bb39d21af96dd545c
GIT binary patch
literal 44721
zcmce;byQT{8$UX9mvjzF2-1RdBOu+44AR}*NJ*DSg9rlB-QAti-5}izG2G+({=V<M
zcisQ*T4$|shO^Ip&VHW#)aSDeS5}n9LMKB9fk0R?AKt5iK=3Ib5ZoauJn+jlmX{6i
z0q3kLEe<Lhqc{M8s6jIC#ne6Ej#@oE$Y)xgPemS-U{F}+>4C3D+oRvaR7KqcGCaY+
z&bK=K?+$DsVK-A#!&6mkIid?P2#h@sY4eJk^D))&J?B(Q7i~k$JCaq*IuG27?}|<I
z_WCE|`|)KoDfH?D2hTZ`oOVYRc7@U|`S+Z^$$V-630J^t!<B$EK}-RJxWC{TU$OA8
za{u$K1Kkj*0v?mNgSsCX?f=}-f&N7s4$}!wL~xm)_n*K29gNI}%=h1r_rUkYFJLOW
zFe702zt0Y$2%-R<|L+rDX@Y|EdvI{-B`W`$S}aB^=KuUz8xD^tK#X-WENm&~Kgj?Y
zsQdpnU%(f_4r=1_?(Spd|HLwkGmbO<-+ZDy=!S8TcN-gS4F8G6KEN^X-+a_YNbCcW
zk7o`71pkRevQ4u6-+Z{0@Fd%>{5%>cBmNT$(+Sh*zxn=K2JHVImO-l#p1VNHx~}+Y
zRwok)3`p6*PpEuQs$0FXdlgY;)cMcqd{Ke}k6Mko1@&&cB#k(aEL-hF>O;2?Lt^MV
zeMuTLlk;xLm#Y~`u0b`PeV|t>y!?u}@_8mVVAiIdUr1~GX;YvV-6kKNC`{VA&fYBM
za0KHYwNbxCvQaGW{G+{>9-I4ji5&^pwJBd!RT{@dmb{3)9PUmodx2DA*1RMQq2r0P
z$7hNoK+-pmeV+-e7qDUh0|bDmT-|g(-MUkakbrv`($o%0jT&_@o#Hh%ji8@4oa{2-
ztNDlf=wj6R042x4zw(;=S6{ExMBiUyNWp80wxEC6m+Yn{)}dovt%pw*R6+kz=qK}=
zWy_@kKD|8PividFX2Vr^s)8o5jo~rHF(AFa`tj+noJzm#2?}Toklw4ef6+402gLe+
zkNErvF#qy@CzjL<kGw7Ipi}w^|EdV`!gZLJvZP~|iTVdI*8f%nc#MtTPNZ`WI;|`m
zUaJL%sA9FBk0DZ-2hxk{I(t8UU?4YyB7;Vg8W40}5+g`o+gdFO4Au^0GJxB#7gn){
z?dH+^xHol1>})hd;uin=6vjgST<!M3Kk6f-Cg`InKG^@>o+_T3!kJQ{Xf=pfi|5)Y
zmB$E-qp<@QgcHtF{C&fp%28{_KM9<XOT)igaFbxdKZX%Dpc9}X2N8@~DFhpht>35;
zAV`1ZF8Hi%&I*V&>si3duSp&LS2VdC62%;DT*Yw>W!Eje9`qjN00%DuD}-JLqfd=r
z{v^}RcJW&OH8|YK$Ztj!yfsp<q^?6-E8w-F;dN;~Rq*z7IpNK)9hS&Ax|E2%_~$nG
zhgV$QGr#BWq$=S#2)<ir=5XIJd^2h&wKpVIc}nQfB{7dC)GqkD+^Qz%SX9KZDz(Pt
z#WI10><nraEwl<#%xMpGV{Z&4aw7}<7L+L+Hnjt>G(_LC=?<m%oLCiAp0^qy88NZb
zXFGc{f_s0sQjqJ4m2f)#6{JOtf|4oj{lBX3FPp7AoyiwP-Oh_Os%vxo`2Je%h4rV5
zcbbNo|5{Z0VnE0#PaVAR1z&R5RRR|>l)qR6c!Gl|Lx31nb6HQDb|6b}M~ctb!A(fm
zVEs+n-`CZW3W+)0{G$$tM0OHyuT!=D*|rECcbf_@YG0AvKZuxZhodT>ARo12!wgbQ
z!zc8p!95DQ@_y;g@WTe_u}?pYk;>cvFEj@!-y{G`xCn#{-Mw;hVAY1J1SW3#rNKUi
z$yDJ7+|sMsy6I$u#4Qbnd2u~g%!^dgk^cs`g)fO_42g-F2)Ko(7QHX^6VhyX$9#>K
zl_rmQ1;2h<DWBBIMS?E=ORa-`0%uHO!wvSE7VSUp^%YA3_6Zug#Dr~9?xe4I+C?sg
zO4ne!v-bX%4>aXI>X*k~9Ic*`7!~1|;+T;>cTx{SZLx1_WD7Xv>lR}Ub#rZHf4$SC
zq)Na}9t9h6A2k;{yQ!_b@2?Cj%D9;VE@K3Z>%?QSW*rw-4pw$e!%=koN@^Tt#5TL3
z=9^(}*l}btP0aJdj?kJcHX;!`xeDHcPh0ber=gu90%*S4eT}KSosctmP55SevdWl)
zjc$@tF_5QP-I_tlU1?3C9j2r{ko3d5we8SuzAQz>ZnxFO!l2Vo*}jsgHve7wGg@rn
z@3h)J!X=`dGS4o05;f*I=F^om{Tj#%&pg=(U4kcqVY)|T$IpOj3p<j~E<$=k`Xda!
zth@IdZ1u9SGPCxSo3E`t!CZN4+D}4{MHc;*R&Isw`j(onktH4(FfcHl@4098E=%FI
zUn}~CRW)z)>AAt2=7_!9evs0extT>Gg+$%mHgoP7g+hMr$)axO&b`yW+X-q<%lh_g
z@nDM(B+=sUX4gp*E5z6CfF=Yz6Ynnvdp?dV$mWdGMZUz0T1F1y;CTi)l@s{V#VIj;
zWotW!^N*uN0#P50J?Wqrc%LQ;t(g?PYucySdPrfZEBP!@egeB|Rcg1gI({D*)VV%b
zJ!9{81{DAwTORC(XLtQ6|HBeIYq5l$6gg`#O;U0$m%#9Cx}?42S)Z#bMA-CcZ!?`K
z0-I7{bRdS;>JJ$ytFp2OD%5;;J2{)TG$q6s5tDfT=zvMYQP(4ub~NA2#2PDplrd}=
zA5WEpsja!K(o^bZIGBF3sC!3dc%=3Pc22vEQB!TM-hQ>>Eg5>>p0-ck*G2Yj%<N%V
z>;$vshcLb7`&aN2CG9_RrTMw2Hv9+Gx(-hEDGN(%ORTNZ(^PWKkgJbvu90A8cl&br
ziE$t;`T;dPbeamvORm;hUO!jp&B-mq^NA&>leb{f&Gi;`(w^;k6kf0?jc4#NoB<S&
zB6NXY(ANC0ih_l_RU+%!R%F&Mbo;!uH6Dz1aRLoNN|KF^Ika!txpFs;Cx1tMXPA8W
zMa}T*Zw$8fgw#w7&<BNaue<u#(eZHv&@R_vx?QWWkrN97QC0XL@b91>mY-B^k5+E~
z9NJ^}vJSUSeJnw|fOW=i=JpdO!;}pm1-o2a^(?Hc*fCM3FbkZ1C|fb+XldqDj>_0+
zy|(SC&s~Tn&XWI%Z;*HUD80;()co3mNBN{lV+ofj|4j(OF1YRI{f1r<F?Nb+)7fiZ
zK(RHlrx&Vdzwl-=z6{GLj{fm*XlnT?Yah?Z^)^K6XFY9;M=7W3%W5sB`q2+t%kfDL
zDW%-iGn#IX>b}ODPD`0~h!<HfIHrw>{it`Y(WDDdqV%UKQDI)eI_@|%Of7AFr$*}2
z18^ZP^kdIxR{1X_>VY{;c6Hk7=nta<ibMk+sUXgJ?x!6VTm!EOs}*cNT7#f9s-?3e
zK3vxBjn-teL|uapaJ79CJnRu|W<Bi~csQmsishnl!y^$XMAsJIAi&mFn_b{dI#_C6
zTYG|u+<`P^YI#;4)m*%8TXC`kAiWT;KdB-_6b$enRLhsN5&c`fPowVjYxmUxr%k74
z@l$J0x15vt*MD_;8yI%S6|0faVBFuzTK^E5y}Zaqx&HKges1m$TT6yc1M2-*+?*`(
zPLJ8&`*<|nzWH)C``F`frCs17tII)stkA+7+tXb&+u!FOEyII4)+f%?#vjjL*FPS`
zP~PqmQoN@Ro?G*AA7AyYU-O(hc3-{T6*yHPILi7V64ve5tz<WIa18>ruRVj?{kWb*
zUxL;bo;_jdPnL2ytf$XxH$Jrwr(mg`1;DqZpSIlYh_tuu6tvGfjz2xL$w;ywgF0lZ
zc(`T!ZZ0z!D07aS8aLbho{wVfn|$ZW{g$Q{sitdImlr+n^`k2=4ZzT$=Ipbz5rt7e
zXWVe0592I02m8>g><0+o6#&aKpGg~Um-XIy8o!8)>8q5=+atQci2;F}uqh6buydov
z)MtFPbTB}1HLkvQ(6)M!yNm2K6Fj$q3KH>p`F_}0=>oW=Xge0%_HMX(dDp>Op+`y`
zw^&Kdx`woL<K+9W+#o?32b98pfFRBETY0lyzO%NG?r-Dumc|dKEy3{t8}P3cv!g)<
zVjDT}m1r&xT!-T#+Jfv{O<r+0*wGi1Pe&~X44E4$I+HmGS0yb~TLmtC4mf@e9+;j)
z1XKw%uF9D16?drMdV9k!Mo-bBrJg7eQ`Hv{v0r0`^NobR(){cXl{W2?+q3=ybs5Vj
zHfs=-(Fw6p_4_jdjXHY#(Jc)nubDskMY<5&9r8o1MC2WfUM?rPS|lq;HxfgrwVmxa
z3&YUJV0lTAvzN~VXHf@#E+gmbY__P?ngvdhWno!scJh%r)HnhCto&KlMyD7QPw!au
zvJ7jR13}2#ZysK=szq`={^CFB%@6BAxahB3>Tf@2kO_4o<tP1#FGS%gRiERj*P4@~
zJr+_-)*|6hIbp)x_7{0}<*N3~F1hJWB%<@bR~fDE`t!Dn1+j7X7<O1}-vZlhW*Cuu
zuY<OI+IHJn?svK0v;BKHWA}LF^l#KiMuh&FHn(fPWCe9ZvbEozn?rv15EB!hwj2uQ
zpnXKWFl4{GOKkT)C@*I;`wAG5lE3Fc__Lpxqa)q|$nRl35po2zve7rI)8^7$xzY2l
zyFIkeFW7mS3UnkzoXKi^@bj5IUOQ5nCdb^W3ik$kFbHv`&uve{=PHRjh&&!loDC?{
zug1qGRUc}#@9@UlLRyXe{9rMV<>~1qVmwn|fABnR&jNcWad4T9a^7=|_4h@Ce%v@4
z^K$#)#@w2hR`GOn^swJ8M5!OqQ$gY~9qa?!MalMW*-LNg(*y-9ZTW6Jnd*H|k&!f(
zLxG2CJyct!y=1%(8gE;@EXOR5-R6DH@;ChuCPi}spNR|~=q|iyT)7AWflr~DxfnR3
zSF&M%(W`Pj^?7H<6HsCzV@Yv(sd7wvZUm%C^PYNa1dt+`@jVB>n5B^s+<wy<hDrWH
z!wVzUVKrw;udDYm*Yn7l&9<~e2)R00YDmBg<yw&m{N1nWDWR$dg|8b4n}G5JOkp52
zK*~C?!bEs8V5%0e{Q1>S-7N=CU4bTNj0i#DA)DHt<~yTfmYG!9Qs(LmGlhi*I-WkU
zJ`NUF$}*|FQoQ<9A*#0QEQ1Ul%Yfz42#jw<)(m9`sxjJp|N2#9dmBuVtgwK(w?1K^
zYZfDDf^UE|kc-jh5Qw3`VoLa0fhaxNAxlur^Fp4k8_9@BJC~EbQaT154Cj<~UGH{H
zKEnM|mI<*jueD!piVFO3?)U8S?D0((3qkr)(+ru!2>69e^*dtSBUX!nc!hCwo)1Oj
zv{(r@XpIRQ@CFHGMb<ZW>pe>8-L+yXSAg9rQF*2AxdhX+7h}3b@ymKVo68zR>)5{<
zuXk_v?~bltb~&UJY1}cNTd8=OnOieUsSvuX5>=eMy5?D(np%uWpp&`iOZO*3+4gJq
zxA55R4D)-MEP#MOR)7AS&O4rpTOc+;_|LY|?TFJRLV(zF{4ja5q;JPF^o9C}XW1jt
z#Sk~}5isWP_62*@X|U&!42*#9=4i$J6anX4XtE~iXb*D71l|Q)x#z_dDeUUO4=SDQ
z=eUBkB+xv|yX>7)3C((V&K!?X@&s%Y=kcK&I$dSlf7-MM=hBpL#NoYddrS$WdFlOI
zl8yq+O~#*n{62L`t2%6^Og?VxJ=`OQ0)e;RyX_!JJj}=JW)w%*-tu(Udz+!Qi>BX-
z?L*u{PwJc1XrDWOCPgjH4D4}`$MbYK?j#(DW-D>cjEI?)750?adaY2(p>Qj1>UZ}o
zVvpzaZbR?QR~2>=id0p|bJ+zCWV9`%c+p*UMxbt)d{?EZGRE>9L`Kr3U#0DB?+(^|
zUkO*dd1_4ppp0BO%C1tYFSq3+B@u}PO#~?0Uz!0UtBX5!xi;R$qh=FvY-;AQ*()Gn
zWvUnD{y9M=Z%NAMbDF_cfV(a9gwQK3n|b0P)Y!K>Ta<<@%zyd?nJ*1406v3H%lE{G
zr5kA$*@;c2z<lSN(!I3vYg+uVi?=b62gYapGIgA2@mx8?ft{f<pD(L{DhpSbQ&t{6
z3GSB{ulgtEdnOXjLyewJl%0pH+-G^}i@ChguIISx6yKQ1q(y&8>6ivgs@~_4ef&U6
z(Z7Xe%YC1#AQus&JfSfLd8%hSX`^{3IFnlL84?57*K_^c>Te5It80BF`!#UZ!!W1+
zof<W$pt=10*AyYA-l&TQw}4Tpv+&dV8GGZ;k4@xt&v*Gt<;|ZS09SQB`8qpy(2k;i
z<zgD*;wJXZr*f}28Fs^;jSUh-y#P$W{pnAAS8=<~nbyx8Le%Y$-Y}>f+tcy&KHQy<
z_buAsM{S?GU*@ncKjf8${hfx#gTLKCLqeW?9^3uyFPqzC6!sEZnfvAZy@gne8~exm
zqZ#^u^Z_Uw%ys62b!Bc+s6}+$<O@SbrFszgve`i2%PMz1KxIGyV}_6PG|zxv6ZSfC
zvHLXGaYh0UT_4~@hxlB1cx%mnbGX-6(D<0hPPtuXZ>2CSJP?cF11-lCi}m5<1S?KD
zG%ZUHDYZSsvK^A?g>1)sIG&=;*1QaO?kpZ!5>8q1egAPOcU9X1BWlk$9rfa<E?2N%
zJeqsgol2sBvN{OJ<yEmRwQl7iF8GI7b&UHEJ$0oNfy+wjW+QjUI^~E6?g9wYI*mw;
z(2#u~^OO4ccvF@Iglo>dqM4UV%x{Wtw+#}6oy+){>et29<a+eE2~vz20B6{S!q&22
z5qEoqa|XxX<SjS!W-~zFo~yug_dCMqE<TaG{o4XFlf7;5d-t!2{_O8)mJF+Un9np3
zfH-Jvbu8`>4T5HH3+7Mo@N#pK+xf-}<o6YHDeobfG<4({)wSx$a&XK_%MT<K=AK^M
z<w<M-zKnibwyly$-sI_TpcPT_{wiHNYgyr8E5Ml@C3Eiouz8YQAVj8OjII=t@5-B)
zT5(A^f%-%6rbo@OZ(t`P6(}X`>ILq4{GSH4IK=$sgm4Pgt*RG3r(yWqwfo<lKRn+d
zf+{ZCKF+*zZShAOzyUK7{=fh^tOY$gK5ifEAGoqTUDw76L-kV>ueiKl4&3_Go&s?j
zqZ`a1cm+h|Y;-sSRxMPv(!OJtM!%Y;I=*Obeh0*Xp9r|y177Wb6B08})g3FBK>#%@
zpU8b~X=)2U70vw&R36WO=TIc(asu)=?C7DmXF9RfsmW?KmSjRqegcX&hvcZS@!6$C
zB93OBF(K1CFc<qj19v7hWCME%`Pcb@jbFsBfPC&zq?`%D@|JseePBllJpE?nkHX<0
zv%H@)U#wN}l9<N|O4A#D4T(vDH8#efQ-*dEHx9AYnI-!=JXRdWsMTo^WnDl0{jD2S
zR73G`i)k-SJF(|=_6>*gFn3GZ&khZ`ivpa^7j(9i40tcu;$vqG`5|iASxG#VzooaU
z)4z^WFz{0(;!aYZ>Q<q%>wmA!Z7$g~;TmYAjqV$eNt46}jcSq`WmOpR(;R)azLJ8)
zpoT?C>%I@HCmS%G=1i)Mt|ivBVyjV~d8akYExYJV!aA`^lQ|tyz^4CV+i6E%PP#nS
z=+6A5OOz7hT^4^glD^$q$T!7kKZ#uw{XpvH%Sw08xaYSIjFEG@ri1%!yy%z75x1H}
z{Ow?-Xi#IXRVz^HTgcz$K%UKe!_^p09=p1f{QcZ1;?xMPpP%l`e*cu>?6qrq+T##o
z%{ZcG*0zT{kNdSb+vwLl&1OrM$l%`$7^lsmgi#B=+8y+-Jzd%-z3Z|x0)eQuFEKFO
zS6#Cwg3Ew%r0rO70TFS0wY(j=GWDUQV&r^9I_BHc?X>?FQu?kg2w(>3vswv4^aFaj
zOZF|$0$Q|eVUVGP2QB@8bRYe)of13A?}KRnXOq*B5;9OgXt8A|<C|ZizE{KWo?2@i
zK!Vh;LKSoAoFRf6F#QlFZ3;Oar)ON0;)THmS9Ov?cX(UzVw~BSnN79{OSvYnL0?1`
zq8&UYOzKJT^+b+*xz%1DWj?QK?4X!(AZvysjoq*4;(w~@3U96$ix64B3e}mDvM&hz
z;}J*fmQCGa$Ac6WMsvT(FmkYQFSq(pKJL=FGkCIqusW3wD&C~TWH$ZQIk&%{`2iW1
zUd4rr!yrXH*jwtS4B%AM2L{IW`iDb0J+9D&#x-YaYTVI$J#K$-?$I4*K3dA?NV*;)
z!LFLCH8g4`s81a{RXM9z>dt%y*>l*kjzW#lFm}&XfWTK3ZYkVpfdVlsrHYBSFZAT&
zzeNRaknqV=+p&ej9FS_nx*(?ZXyG{h)SUy8p^Li@X;{8txNbvxF7(O$($#*&D>_T_
zRyDZ75tWM@-eMg3(x)A=2*-Gmt;>G{7SO6XqkaaWlN57oMd2mfl<tQ5WfE)^=n`4m
zw;0!HkC+V)?9XSzE~dsHC*syWR~Mc-fmnq<${r|6%_(pT1g3sC=<P~p{p!<$J=ojH
zN*AyN5vY&SBge)51!7QV=I!zQlC^ysOw*%|5-f#!(c9a*_B1yQIF@%wvZWasPf&lz
zuI&~cwboeCUk~zRnOgm3X1)Pa%!@9Lr@m&@_cW;Mm-h4J-t*H%HYE40GJz{lPgAin
zB7wNLWlkEO*P^BHLF=tJ!d^g)(Aj8gH_irr)aWrfm-fG1^O5_~PQ3lM{}x-f_U&7|
zEmv9|LXi8)B}in!mTuv1PUHt`LwVlHTmkQ2-5*#&fp{z@A2(+Z_Xv2X^48nIJxD>$
z!H$E0xCQbuO17_Weo;()ba|UJ>_((*n>fGyt`Fs)(ZS1~okY;JY&ZH=#CtDS$l``V
zavJh;x8-t!_|*4a0~%f}9zw$<#tiX<rj8x77H*+BX1U^xfuE&}oo&|{GM(|0vuhdd
zJ>yQ330{8d&%5)JQF$b^DD2Z08ep7-sA|G#eGA*ojE+_Us!1!0ukYtts$@LP6x&u!
zq^diCguP~D8P@qJa@3Ap2gvM}4W?{r2kG<Cnum8@3V7e6qvPbbU?+W@fmpbC`LN>r
zQZ6!ACo5Qz<JC7G(~yn|5fEyt8PTtPcOwOlrI0Xnu@&6C<KQ__a$U0g*>Qnj!vQOH
z2ryGmZ{ajfrE0(WWPaED7uN&1_OjaSTnjAEhUa<umNXe;nJWCeT@HaMNK&>TK%HFB
z-=F^K$Ml*q_~k$dsbt!$)*MZ_UEJAW@3<z!;xrJS+GLuuHoA20FfC2D8MnHXc4cZ(
zZ<_j-wG}x<d@ol9Dn>Q;Eg`B?(9+{G?9z{F#*u>O$ai8ty#8DA8tyJ&fotcH-ni5%
zSk`sgu*Y_`HNU-&f0N<A9W0=^xfQg$ci9T`%0!-^D;CfNa{JS^5-L8>I%F;1z`gsX
zFaHdYsqF{xXNs6c{><|B>k}4{r{2-}6)4dD!Vjhli30I~zE>CTRld%;KUb(<tK2dO
z-bfOK4ISE}p>&OBl(rmZp#*R3ymP(gS#v8GG-;cd_W?pJb)3J_#Ix^#{Xl*z5Mm43
z1%{J#IC5LgceA}(9hNfo+ksq+kdptRH>><da&{@Lw|pD|Nd3(<u)(H*G_MjUM~{K3
zp?3^oAS2NH@&J?QLDqWwPgtw}z|>{yCHb0~OcL?ApKWcQY-LTpoV`L~9M&frDy_6U
z?SYG=Wm1+f+pM#qqNE7l!5wovyi7ZLTi@x<O&Y+lNQTMsd*2(e%KsoKt11GLY3gS~
z4%tsOJ{M<`7l0uxfBpd^M6I;8b$CE&21G#Mip}I5J$;j_ToehkQ^$uAv18<i%z^Gj
zzWL71F0rQOoaBfX@FVKO6P{!(-dI&cA}~?Xj4SH)iZ4RJm^yGDApsQ`CQG4&in?=C
z!jv%|6a@#&0wyD6V-V~aH^R?nWG4AzTIc3MOh6?&JR+p*LUIk$dgGvF?{6ojqG<=`
zpWgMg(pv}}h<qrZZTF<!4IcjkL!1mG`fu%h^n-xnLe~5}1ropPoH+2!eFtS&+S9W(
zN*-oSBaUAH*@EB-mVi0A3qvh^^kkE2eb5fP%jc>SRR~wEEnVF0NN4ZbohUj`tA1DH
zB-%z-TOUG%iaVT#ZxVYN_PYpr?(c+9>l?CsYj<CJ0FAoi%LO{}@qB>0nk4AjNYy;3
zl`^8ULN6{draXDx8{K&_xycSo|7tst^A1m~nm=*Ib5dIEMJ8qM{qWD^#UC2Fq}Amu
zVmxiviM`Zo_v03}bS-biU{AZ`fgv4j`Hv}jEl+D~r_lUH!yw_bgtI2^PoFxEZvdC)
z2W0HC?0kY7JDq6s9|J2bueVF=6zwa!N2eZ(;ec{p@ar>-ZSStkbMH5@qN#hRurYt>
zRP^8;$yhe7vzH4J@pCTY3r{f2LN?ffWE=0ZK1YET^MHzNq_FY5%V053#+?1dP7V^Z
zByUO83Z|hz5!?RP$eSDKGU*yXGU|r%!g~(hLc$FjzyFBE{3&_bI~E)^mnY}F#WJl&
zyT*7Xod$CAJI>laZEOGI0_2|$28&Taz~=@l!Ne>on|T0wl%VasSyBnF_df7C;PHWl
z*)N+aYZP`FJEI7Rie`Rs-S<`>088cIPsv_~5YC(Yi+|bU#jG3A>IAxeUY33+_kAsB
z0K4d%Uy^=ZMD_we;tKWp4gmBK@)vz1*_PQw8S{%|s8EdSj#kT;S?z!wgkvo;Nxi=w
z-qmX3l`aweK>(nDp<->4yq_cEO^{}BoI){7=2&pP*)LO-{RK-kes2EXfb0eBC4zao
zB*6z%pjpy!N_c5CDGA9^G<-+<<DM?iJvy-q-KOea%(Wg>+JPehm_XD(<VX_v=J~S8
zNt_t=)x(I18i3ovTOBjVGz~fbqKlzoStfq#NA+Le!{MQ~OGcLumHv)a5>i8w04Qr#
z8+qA<POJIv3+V;ZG?2(>QU_A+3GCGS7rt5!hiI|_lUEWlDPl2Ir2Yed0)6vNL-xTW
zhA}FpICTK;`YxZNa~aA$nS}8?IvqsZ1*wd}t~LV$a#*)XQvkTK%Ri{=fQDCuS)y*_
z=13*HjAp<OokrQ%Hl)VT%#|P36m4p-A+e8FGp_dlPOY<-xw)?$Nc#rBkXa;i>n)<|
zXy<Zyy-P$@2`9*G$y$34Lcy+zs6bv$q~Q5*t@S5(?)~58^gHDh1JFLkzvDt28n)k1
zZd}pUzUoaiMoP#BZCrI|b!<3!eqfVP!>k`-%kL>y1{w<&DCKYL=Sw5NHdtD~nv`su
zI1rR{5UKoyPb0byICR4;lN7CMRJbDhhZFnoGtPJ4;9?>(rp=JqPQS^Vl;zE!s!zUB
zk^dp;H*RkAn^8*-zva3KfPz_6X|{|Ek!HUG;CT%}*pl4*N1}9JP2-HLnScAi0GULq
z1^#V_nU^L{F^8^=Ti1zGL#rLzPOf&^$73G(EiEji0&Bd1^nUzDTQy)k&*{3f0F&1F
z2mQ8VJ8<x_aVzLG<IzFOT`;S`MXtsI|Iie*ulf^3<g%|Y+p~%xtDno0BVk@c%N_aU
zsutPVXqqS3Z(FK*)P#<iyLt5F{mS_n$n&!(>{BCYU`ydb>inWCuq%x0DGEXf=jH!V
zoQF@%FoKTa{722^Zlh{F)aC^#RP5%3P@G8y!>Si%=_}Hz%>4_QWqg@p_Eps5!Gh9>
zN#9e?aOK%auI@kl)Xn8g!%qqsl=bl4-rRfzq9#G)>fTo{EA2F?nV<a#+kw=EROs2b
z6|cZOY%0bMOL+Oz#2Vq7YMD2Hp5rfo{U2WH3or<1Z~7dqv;#-~g2=iNv1Ss?w|vV6
zU!21t9U9P@F~COZ9a6lRtm9qqdDQousmx@hokk(ETi0Ojt^+6U<SFdu@@euJ=cf0j
zV)GyI4)kn$qEv0)F4+V-Kh@(Zc5cv9;BH{o&+VLxMNG$S5+sr|^CLseG#hx?N;;$%
zsObT$XD?TqO{Tf^>yliPVBYkxvzNHOyZ6hL^Osae8B`FDrcDzO5<3KtwoT7f)RYa~
z5i4q`D^>8cf+`o{akzX14N{E{3Wz<GMJ*yV*yNMDr@jGL-v0+qe)jE4P{)Q#w1Pb8
zh-TM;fjLygrJ#S$<w2in>X>fDU`Q_XVG}Kn7a1z0Uc#$T(qZ<7nl<i$F_gw~R=QF+
z;yi(iUr-nwys6r8OvjiONN{8e5>V@a8lgHB+xi$ynN;#gd{in}I&|=)V8!9bijQ&;
zs|b0T)DR9B-g5TzX(<pQ8o-j8zU#@Gw5b(XVT~O>rrSCsvSMT18%Rbp-9Asiqju;2
z*qytDoQRYZ!G;Ou@N=8gf`S`kjeH2mb$PP}+IWksD=d9E5A`#MHK{kyzo2bf#IO>k
zt>eFC5ii`5$U|plW)?-CP3}IiLER1%?bx|VDMjpd3ha0^+Gk@I8LucA-l`T@;p)D~
z_i%A_bd0}5`4DmAAu|RR)dHIKdwY9~Y2hdF<NqLw#6%?DE<I$Lr;pE7gd@U#<-{71
zjE$E&>Fev$GA>$GCvsIYEWgp7paG=jtbUVV_892!iHL~!2?iRiP^_OjxCw3gU0EHe
zgS;K?cg%GtEdmQJp(NI%BiR;zS3%B1&5zWB=tRrf<w^1<D!VHTmC&#ely}!ljiW${
z5ef%xqk!2va_pZcTwcDsf%q_vEo>TDnEM4<3caf6m)0$Dt51GqIX@aLXu<l*XTakG
z`}HscKVW(5{6`(Uf4n=e6b;J;T^lzE@<&HMFe<ib9!Krp69$R3;T3IiRl=v|2i%FW
zewEVh{8X<^@SS9SBQ*Hok#C-I4<}kMs&$rgednRG%yu1SyB?DH{hT*eVt>9C>&TNT
zV+an%HmE;be!sB+RnaVH?<81-s+Gv+o|%0l)c9o_PbP>C67;$(3@)n?FJnwn3$1zD
zs``O}URQ`sCYSVJJv!{uOC_@~d_)izMUb#P40%kGI6#oXfDaktY`=EBjX-;`eHDj)
z3;{$$M5I7OUIU%ppj)3z?YqK*?ecU*|3BWKL9EU4?MBp#LeS{_6HGZ)e-W|Ghw8<7
zwTO7t{c_w_6p;|&_D19>uTzZmc<Xh`6O>`xm0B!8S0M@=|2@330)fC9Gy|_^G?JqO
za<Rqq^~gPHF@I;Q3X2Uo#Ebs&5Y*>0#xL_9mkie<TtFZhl_I{<;jPFzEs5o~Ik8vT
zHS9xWwjFk#(H2M7H>?fGKeKnK!3}I2n23y6%yFOfW{ciDu=i&&o}ng9@7(N1)5pd}
zjIjoyL+n&Zwv&rVuwx15RRL-oO5N|v1~PPnelKYGJX+ttOQanM+{*D)ObQ|obdUqy
z`odDq-UsP<lur%2AMOZ14mt419?xj&khh`rnL)x22BDvD2f%H}`$-!NLmVGw&Fzpu
zi12|N3wcQn%JSPMc?8{jE32Xw+LReWApFOD%apDY>73?+_q?^JaqZ{q^7_IAZRnu5
zjL?l|X!2?9{Rb=EoEm|Z$pyOeN#nTJDn5VtIZuG3#&Py_@LY%PO|8NSp9>>Fo{Q;m
zo<go`x0RNmIr|g$m8;16jeAgbGI!e`%98;9)URJkq~6i)OkuNpKjTF8pxTCl;XvOR
zK|0sZ9s)-^-O>7=rvhK*F1$!vbc-k8I=sDO;L$ZE!u&i(^||+Mj_GVoDO=L<(>)&b
zDKAbD4qGx7Xx*`c;ZztM^qs-W#pe~5am>f(rSw7q_5u8#w2CNy*>3CuY(>q97no&T
z@~yO#b-L04k4X<NFG&XO!UqpH77%G)Nr;W_onTJmrOt+^`#tJfd<a9>iTo(kYIoba
z)-NG+asRd^Cc{j8cojE^c;l$iaUpNaFbWw+PZ>g;8w59!obnqh6kzezfjD+ZTMqoj
z0TfA7hlt(Bar+CWf$c}ZJAiqiI;Bxjk#g$U&_;{JK!gCQA~grFgs-Lgi*fr78%-^=
z%_PjuP~$}0U)RWm!DvfuX^gAR{;;8*lE;hW9XQt`fOumtlvwe<xBe_$v8g;S{%uHr
zc<~eK6A3rhQM70qXm2@MN#5!jnN3D|4R7?`mjK^@4I7kES4V^7HY(vSl}HK$Ptw%y
zcZg(#x(Cvq=cKv|sCMYJi&29q$T(6<Nwz1gjKP3lhPMCc_O)2x1ghfzHx-=Id)Wd)
z(Dw|DR2J?JLjho?^N>u`Z1Ff}{la%=1MeIZ^(_KDq_x}|#wq{Ib~6wMWk{+ZQiBr}
zEL`qWsDZdDQ@vqo|Bfzh6eo;+ZRRVrgfHy?G4cAuLkz*$aj`=)e=GvBgKiEEGGB5q
zWO1=bk^+N?nibHT<4yu1(l@b~S;r9W;J^_H<Ly#&pldW*Q8*{Oj4XCH6Uh$J^{e|$
z^LkQLK$pr40CZY}As+SL%<Yk5AsESFr3l+xhkFx<-py!=YW~|~hUyZWFbO%%c^a?n
z#3#Y(?w?4YC^zwOo=>qEW+clrN;ZUi<~%bBE)?X2LzZl#mTW@Y${HdIWtHjp=@@^r
z(j<W1ss!IxcY$N%JcQi7+#Ae4Jz7A$9rH7!lII;rDZ7!hLEP7oOYI0Mkh7<3%^THG
zbK(@DSu&H<1<Dr25YKlaDXAvA52lYON7=@#=lZG0tJd0vN#91UXb|KtK7BvJ&~kP0
zv?1pZkd4T;h9U%62@w@$=Z#W0M1!WP{C~+(<c%+0zJxZln!q}ZBBKo(L=*Flm!XR(
zgWtYtd_a*lGT0Q_dcW6AJ6jn_VXmIGN%OOiglPrHDt?-okdMyZ{rqTJaTmT`Jo%s6
zvvPAPt=@NjTw2d(O0yhj{~Y`BhN=<S=jm0(E7(Bo8P`LrDHpnEnX2_VwHG^y!Hb)t
zW9lu+4_BFt4kARvaK@#z(uy3*g7>`SRt&*5TN_5y!#)=9E0WX@{Q$Y8_?J)n?S_@$
zcA^-h<#wz`yqLp>Cs-2wpmcji`YuhM&LP(!+o1??v_p=P&U2$&*<5*%?O$>Wa1D>J
zFbJyad^D${I8Q1dQJk8EtKVO`pce6|@iDOx_YklKHAXwQ-CX6oFJlJ?0;_%B#wuk(
zlz(V6p%paex__^#t1SCvYb6z86wk#(NmXubQ`$Q5R#3QcC~p>w86=2CqW5f!?~TrN
z_!_8kL7;<EQMv0f+waBnQG@x3>%9+MOGS^@epee+J39vOarN$Qo)Z@pP!Pidk=xEG
z29~k^jWljm4TNYQB*|}p@=uHabA`vPJwiSo2o$+HL~52ZF0r)k<^Cy_!j9PZbMvy`
zi$(w42Q)9pBU%~6|I~55u7VT<;%^l*XYcZSDDjUJl;-%{;wtsV<VN<Sj?~g8zSxR<
zEVH~8Pq8{WMa5D!Hj{xxgi0jh%!-1wtfxgoiWpfhUC)}`baz`?9`;3TeP+9AN};r>
zxOK>|m~rsFn`EReboXX#wX-(RNB5VeHKuS{ZS(RwtQQy?0EDw2+#<~{9zs6|H1U6v
zb>9y+rUCAhV<7Q65LiDweGcvDzQTSc&iX>NW671;t+w=y73yOGdG09D-MS8G6R+DI
ze?GlsBkfCWQMba937IWtg97nqtc<EwDy+?-%infHZi--W)|Qbr?e*I995|C1OxyAy
zQzUk0f?S>~(@)O!gq}a{km4J_Mk3+0XPpw*TJQ$m86dquo1FByld_V+6fw|oq6fbT
zXzaXJbieH~3yhNzXKfNRG1d4e<z@i%Sk^j}4=V8|^IiW!?k;;o1i)&{&GMk-;$zP(
z45f`^JObz$Kv2n~0z-;&;?IFy0W^6~B4Zi8T7_a@0>PN_jm-EE6N4bF<>9&nIwe4J
znW-fX79I$8Xv=S_Y<%}mtEn=t6yaFGq;f%xvkEN?Bcwpv-%6@Z%KR@Brk|P)`SKV-
z&6C1x+OGsq4hSiQ`Abti4%Se;PSMR}E9OnjN=o*xXeg~FPeL4JPpWa<1~V&;4+*Gx
zVcLD~xhGH8yTm5=I}prBnXDkWJV&ND%8jz&607hgnA-j{ekjQ~nk5rxINl@o%WHZY
z%b9EVI=ipo);bM0w@MdH?US;y3vx52XNiRzkW;7?)nF2*Xcir&FBKosCEQRI3^#}V
zFnEiO?D>M-ZLq-U9U`^mtSoO!+m2Gs-IHJB3^z-*W3v!f_XvPRu(DP$ThkmRs!a>E
zJ_WDfp$50tbybxp+LI(Ikja|`4a(0`3MZKGv65f{3{58rbDARriEiNpag2E)V&WAl
zC3mIl=n7}KfW3AbB8-N3Kjz^v1`XdQ1p3G%He~zP03VuGMnQ4#82fd<<K_O6T@IDt
zD7%joch@Q#WslH>E3C`f^q3>1l}mM%L!i)I7YYJctd3RqpO+8LGfPW;l@v*Aun}(q
zN&=e02T}Bq(S?B57#@j6hMpBNGu0c)=FlFM85|hunz51@73la|LeEdfG)YG_g9e>0
z<V|&YWlX9`lNC%&_HeLcNk?aYP1FUL6|)F2*}EwoUfpr=(};Wb2&rNwr$Y+WtzSK+
z)7o05Hd3y%2CuvmC;;5j@5z;6hrnSZzTSI`NVzw~hd|9MWzQ_&_KJRB3lS(b0KCOE
zbA>4V<F8o)qU1A{gnNw9kY2j!2%X%8h!i1SF0aD85hEH3)>!dh+$qd>x3#;|7IAWm
z-YvAp=aa4qwxhNyIk9glfe$9va+$Q!#t9?$Mp9No_eOHsW6n3n!TJk}GtyHf$$7<p
z(u9O90;~D-$xXt#fg=b?G4K{^T-CkYrOMct*zUJi*H<?<nApSg1qFmd=i$FPbK=`z
zfvds?>k|V#U-e5T4>0>ub*|3_wC%HN>XrEUv6G|u^nu<(&EioJ-|=0~?~Zl$CVpP7
z&k1Cd+53m!{e$CVY)pkga~I^T3{4(>VLaFX;{lwVcMDAoLd&?cfrS+n35DjUrGi<p
zZZ0`#;`NP8bCWkT7kIcJzc62EoPr`iKf3~c^}cWNzQ4amQHBFP^AIG^e-4?sZ@O+<
zmw5uTSNhUzKFW=KRUA(J?`rRQ9<(qudj~OI@X);Gd@VvF0I<m(mZ7?S*58NW8<X<@
zY%8Cznt?4gR(5<S64?{*3ZtdNSdjp$4R;{PN&&EjH4OI#L4bboCK`LyrgxBDrudAg
z23bv4DrU^$g_l&r>wI&7R}|AHm6%t$KcOOg)-hoL>)d=7pf8*Yba!1n*Wbg1VvLN8
z^&)y^7y=rW9)5F|Fpy6G=-Xil8JzSjRe%xa)><VX3l<1C8b$Zb=ITl6WTrjk{8ofq
zpIa{vpT^?RWKzD+KKjsa^sgpL{5^cC$|`HC>Krw(z~Iq)YjQn+%t?Mf1SH@n#&{wj
z#g|`=c-ds|v6BaFW=NyFatBT77LOF0r)}vd=;z;Y&?t3nIC;y&##RZem_;hsa~(tj
z8_l7qm3CAWXe^K<_lFtf6c5WR93dy_n6s0Oo=qDAb@;(e+Q{q~{T~Zi!Dpm8i~eZ`
z8nX7awV9P-_0a_h9sU)A$fOB!1>IIS%ECt?CNW-p?&fKex<AqMnVp54B=Mu5OtMj;
zu8l=igl$Y>+)PWM=3|EO1LxdmXlDO}W&aU%H&T;&_v6e{!jG1iD&A8q4cqR8Au<5$
zsayPNHF~aQ{zlO+%%&w7$tiasEiK;Gy~~c2KLQ&ZjqGXXmovev@1H{&=woP6vw!v$
z0$78#{^uX&dz83RV&V1f*LmhUnS5HM*g7sydC&e-=b2K<l}E-?i-ej?e&;L^4eKj_
zbtoLsxJ=dP;_zPEuz5OKw2UVWFcP4-ouY{jkhq45nt*NTBZJz^l<|3xB8RpcsfE^e
zSKn3Bt}`V6o7|Yx_OyO94cWG{F7NjahrqVZ%G8Al8RPfd?}S}dmf%BD)ko^w<)C7L
zli>%j9LKcl=H<5C8#~i4JFKxv{9FdkIi})eQFgke`EkjZLBQdvEP14L6m@Zu8<nV7
zBitkj=S98*vyiO)hb#j8Zty#A6;-^-;0<2_gyhx`r>ncdifcV#M0dMzXL(t?frp<#
zUb$!-DN5hKYQ#iI?Gb`BFiF#f?exY}xc4X*k1<YEGn(~5XH7UR8ZJelKN#B}s$bVs
zsHEj{EHgLYv6Ei|>`M-e2NQsl1f&t>QOkx^d1fX8`c%T@-O|IkVp)m)%G?~IVT}{N
zKZ~yNu+wh80(f>rU{VSy;|5WHX21pXk41U)S&lOSdvB3UQ<^<E|7p6LCV{c-p|t%4
z)M%xcoEEPzWrGN?2`K%eTKpYSAc=hra0UHg?DA733sw>+GSy5-#tj{S!o=g2P|blD
zx*)Y5m2VN3+hvWFfBEhj2+sI&TN8Nh^`X-w)N>&Ul*x#V`h0U!lf&r_B&-f%Q=Q|S
zDbwlvUMP338kgHJSTf0;LXR8*fYpsUA14vu)(KROd|3lyabgg>X^gJ0?-ZKCr%T%x
z^|LAj_@ke0(Yg-Z)Xq3>@q8T#cCQ?KDRN$kD)Jh(<6^N8gBc|mLYhyiVHuA>j-n^$
zogu$DbBI7`Wl2eTxL6ke#>jH~(Z^HK&z&{KlJdGvZa5aB^xQ{(8_3U;lXdskM0oYx
z9=}e6x%0>@0&BPN5b_k%Jx~*)%WmN)*o}m%uqE%7t~mlAOP)T;Zh$kf#Ntpmbo?xt
zKFe8k$r$SF{i`NwM-07fyCwG@Y6h);lVX3GSeg1^+EUX;DzHp-AqoIi#;n2te6K$&
zS2)qp1NlA##<ksO#{}K3SJ)+76K!M7e~;?5lM#wSBr4~RG`sBH>P68zXBg{p&`^?y
zySc80){PO{%SjSb4(Jg)2@U}KBivzQRiE}e1{iux(PHRO5r5alt0UIvF^>!RoxmU6
z7K54^3KG48r{agi4FCb@chnItQ1W#^)9w?h&|3#>=qLaXHpO^5sM~nK6J?w8P1gn)
z1=aAtm;`m$@Mdi0-HA=of)%HZ&FgXXZ`r)jezX!aTYw?a7~`9f=crp_(nl^>B`V0{
zyCn}x0C_!tyhYscQL+@Tr&Hr41DuH27(4O}bxRj0O5~&ok4Ydvo1Q2EPFyKw{)$?o
zDIhs9JdX!BbO52Ae6(LFGaT)Lw-YstiK+g3Yca|<6Gebg-xZ6bKEp)-7Us{L(zXe<
zk=?jaMgQp~_0zGbHMrXq*!#dc_V|`UT}G(^$Ty?3tpj{4<RBR|y>kqlhp@IPwzf&a
zjs~b~ER?P9uo9Z`>pJmB*MfDO1C6D6fCc9ux-4+e@cdDpGoNYKI>}W$J2Fy4D627I
zY^EtxJiCu6)wOY<GJ@P1Hb@9@p|1DTw@fg4IjrSmUPDpm*(FzG6ft6tnwS=!38a+D
z5#lef=LR{G0MrpcFl`fRG&JXWQ5u)zH0KNk21UMn%LaasydAKdU0axT|Gv+z>o+UH
z8zqWOx_$$oo2ENNCcE3&0uZK}saq^eLZak6(_JE><U%g~>SzWaD@{$zPEGVb>ai@{
z_@QGjRsBwgY5W-LHi_n>NoHzxS_HTCF=_+8i0`A!K_##u!eYPSL`MNgXT&s(c|*X0
zRWWch!A#L_awAu^mi|;!Wh8TD6LFeMzF#p5$Zm%PlzZ5MC1^IWq7#<k8|IHJbV8=L
z00z=`BoBbg2H>pZ9EU98433#bK!*wP6P@}pIXQ3gil<<yd1~Y0?(YG#BSINR4n?;h
zO_ys)&P7dl&++BL1}hVN-I5M6f)46-S&q8UwIM=Mq|qD+W$DDDSFjxf{rLJ=J6RVY
zCoYSRtgXL^N~eZ+(>?S%aR}BHXY&6&CcAE>B^pJIQz^JAtwwv%hIMZ-G=q%h)2;^r
zPInczzF50&SJCf^))dX7maaO><`}gTh!8PuniL_-fX%9d{yBi?P|HMZ6=y#hL70sG
z7sN7*gB&N_nZXtv4D{51$CN_#9e!277yom+PccJ)S;o0u^*MQ~ZI`~|hDm*}N%9$@
z7F4*Lg#;P8=+a$T0icS1$9=A3xd{3o%P}$HFcvH9QHKiB*4Mv6fyW&7lo>jv<8e74
z?sb55JU({4+Qa#tWOn$8uPM3Xu?zS<>Y!NfepoIHq=7@@ROS(cAHD)dXaP)7Gr;VL
znfMEo)afc-IV}(@czw4{pAmk)f-!Z@#@vyQaDN&9wO6(TnoGZLa2NXx9+Y8+v*BR7
z9~O&vTh%8WF4pFU<%s?lJp<0X^)<1Dj~~enk;Svgt3Yw@?j_HzqJgflA<+N4c>4Ya
zS3CIYPm4a2x(nvL=(9&@%^wdFc%ObCd|!zr7C%7fLj5xg2cnf6A1OKpjwF#`F+`rV
z%HMS3RD#wWnz;CV_i1clED!7K1NBF_ok-p2B~rj~XUvN{#tQlKUP<LMj`>u0x4Zyz
zJH%5#c!PPg6O7CgKLX|v1dzU6?1GVmuc<$XY-4Ya_%RS9*^QGTHNUFE3JMDDs>oQA
zp0wz<*<cE^S&wU`Lb2iTfl(mx$wbE}X_p}0t1rqT-1smBK39?6JHQ{}bRc?@ueWBf
zA%k$;`oH=h`!;dCI-7-Jk|8&qg$1r?!%Z>*frl$9{G>kN41T&q#3qFG9{$ziLVD+K
z(FSP@P{p6lNP(PZ*xW?e`yI~e93{m2<Ze&}2_~65QP^<zZ4P57zAmsB2H-&+_`Bxs
zB&?5gYUyyhFW{k%UGS3AUb(4e4HFH}ivS_XYV2f`<C!7oMhZU}HY(QMc=s;6Oc;I%
zTNqlXC>1KApaGbe>KO+fv`b9%2m?u0>zUtC1Z6yH7@}y3M*Z9Lkq+EBX)q(ZfdCC}
z2RsYq4AX_PxW+j&szv1HT{liphY{VC6n>aj>t19h(2ER02N~AQN10)WR47czt-bbz
zDaJhsg97qNLsaRwQo5H__<=+811k7|rhK3D0^O@AGFpgI!iYZax9{XQ!VTmGoeMMf
z14mkx<SrEAOlGJ_wg*+I_96*Cd;#oouv1%{DytA*=LC1pAMv65CxE>B2v^m%=C)Q_
zJ6Htoh;s=+Fo$jtni&zDmPSg_#J&o(5cjDnfIu!@TP0+Eo4BRg&fbIU%<lOCIN9RJ
zm+>5_gEjzg{Zr=RCFdQ~C<WU;LkKrU&dZQyJ^b(!=jNn!jt2Gbu@q?nh(4lp8@zuD
zCH9_34OYty?*1<v>Bq;$o!@SGplFJFrv45tUg-4yxB%X<-7Y6<$XQKPCdbIa#4V77
zw`nX?U_xh9|LH6?WN5xjG$#P02miC(22}#`lABiin-Zm~q>2v@!O9F8WSOf67r~Px
z_iVT+1*Cmo)W5hD4&1<#U%LqIy7=?hINQj081R1+2;9GnV`FBmqS(6)o2`a2m{6HF
z#D)9Wy~+3uaSNDk6cs|lzisT~<mpy@y7r5^6utvm`@iw$5DMPu=S8`%&uF;?F5`6Y
zl#6~lm4_2@Ju9j~^MMMs+%JGXeHP|nFY4@`>tKI?p<f6GSGwhO$biJ|3Z}_RgJ~M4
z7dZxae%SR|0_R*609UkaXfsW9_HA@Pn!}-o1Z1JU77xGs>Dj}=#6?!v7=tup1Cawf
z=#()hy@)D+-uze&+8)lrE2E3UXcaQZo-w<<D>TLM`D#h|!d)7j_iB9SOQy!e-n4lR
zPQR#4dy_S^$O`_x<?_S)>#iEVbu|+AJY&8`plvOj_~fA9VKq$}W$*%f==CAciT!rL
z3n>v52b{i=?s00|-s{!(J_Pv4OTLT~K3Ik*wXM~1rIR_IjKRGPt$B0OX(O6p_+&rC
z{AoOHC(q8WqT@o!m)noJmDxwK_L6<w@&lyHbc9Qp(G`3yM~mKJC-?saM8OpEL;)08
z$d3~s*j=r~&eB_7$TFJcXxyhb5mzAx?Qi8|GwiSK0R*J8zoM{z>KlT0W{0Un0e_In
zy8>75L0Vy1%nRCx*y=v?5+ITQE^C`%JvXJdS2_DJUO6@+NIyH{o3)D<mu<8WuU~S?
znPAnIa~nTHay}osFMZPGA-|(M^V$*S(mY%7)tu5-4(K-0xJV>HP98-RL6g~Scm%$V
zlYyZi@fC|Rpd~)y`|{B1fXAYcs#ekv9glkM&H6<hWlrUdFI&)Tb;Uc}`&?2pdG0{5
zHdq8^K>%Sx+TWfwc3;`<8by;kxHp*Cy4439oMA-E^}HNWD8)e$*;jFF#?BK`;Xc`J
z{jPKgwb{mL(ailv(kA;TY8*~|_eF`z6?X>%#^t&Rcg9g4iuZ{dJ0n)IGU(z=sOa@Z
z9A6hShG2ecd=ZOiL%VI&P+aN@EsWM7Rxo^X3!u#$YW<C!f_&(A451Q*QXhmb2tfg#
zbZVu5Ht(=?r}Ju91vAX4u|3j)6zKvW_=~P@hI}H9jYkO*OUN@b(?I`;UMf(KGSPE~
z8aEtf7?WeV^NGD58q5AU?S)0_8lJ9Le`wPkd>n+pRTv~CurbRsfU434)<HEhn*k!@
z6We8)S_d_x+J*O(rXdq>!aIv1$~MG2d--k0ipJz;`&_ccLI6^0*odlwLPK^!7Twf5
z-ekxkgC_o5MUN(&c{GByhjcU&^M2+QH#ZlQ16OtN>!@+MxCbATkKG}&H!-Wizigb+
zAYnwve4kprEHh&bU-n#ENH?0k$R7DDH}(N1BP8n?Z@=tg+tt&H`dg~7aE8zv^KOoP
zY+F<WC2LvH#dK<ttR)m*0vh%X!0W7-Gm`22UwplFSXEuuH;RaYph!x?76eoTq&q}9
zrCX5h?nWA^EuCAWq!FZB1U6FA-5s0Qbf3xX{haT)zV|)ny8HuStr=s^(PR7;#^+P+
z*YMcqf_<cayU{_IHVdWQ&W4%9#AOE^$Etri;^FoO;Q~4o(uCOqeiR5!((dL41trM=
zG*ikNHKvebasm4Nme8*^ABtCs6Ul#gu>4HQ+4)5NHjtAwVj>Z{Au?4IToq+Mp}$j4
z^{DgwddU)Yg?mom)v(Prc*nF(z?)|I`wSWoYxt*$$B~*zA$fetKb6N*6k69Ww>jC-
z-#d8lRE}1>7TzQ?8><Dfc*I2Srn_Mkgq6o$QAC9jj$;BX1mYUn3lsF!581auhI+j+
z&GoPj?5cQ3ZGDbSYNEEHtKTy1NZzsKe<*TnQ7q3)L7XzQDd}Ne=ZO)~s&8jtp_QM4
zRje6sFs1`)BZ|-Pc+%y^_Ta>m9zk>Q=_XeTgl3<x?EZL|nyqRr*%Nn}2MHi4Ud%J|
zE$APw^|uP1Urn`WsB?GqxGs$3m7s6fwxaY0KS?)5VC$u?2dIM-X3&^y$QU$BP3&4o
zpW%)sMAp%cMvIJvF7*dq)H6Zs|BJ)3UkJ0L&1_(wlm)ywEry=JP;Q(BL|03!?aVcM
zaev(JF{pHQIeH#f@MqB>{2+wO9HyrM*~Z>ziJ3JMahx$9rAle<2(H!-(38&^anw3o
z`5fB&uJLkb-8pB$dhGOl%@j84tmf<lf=VGE#$Aa2VRH>i9|c<f+=FWOT3<3ZA9VI?
ztjM*E-4>bpD*qBvN!Mc3s{549PpB`UmimRXqV`cs!Q$e;_;AVd{9hP=9WX#AU;xPa
zPlRSelbs&tX7kzEy^8k9fe`_o;<CwYv)-f$gVZ#&Lm6~Wj|=n=allN0ZS8MU47fwO
zvr+<%k7otVL0;hWSKO>w_;tebOCyrzG~F1EZYIPi)1K(6cddxID>w5oHtb%&o7jm(
zQJGo8;F|GRlcap}vpslHt5FBp&u1rjp^lxHp<4gQO{15754TM|^%~GlQ!;nk9~@wT
z;xL>f`rFRzP9yxQ!B+F2K?B;;qZLXjV_I?$+p=X(?ol&RS*vD{K->+%@;O%dCl*%N
zyMt^oN8;|j6lv;4JVFz>D7jl!RyOZcE8b~LSJ5t^MBaHXpcS;BCm27dH}o<n*W=|D
zWHGLsFi&}ahH93>m-1NXpQd)~N6~fDls>5%&2zTjCo$>+ux3a_+##=RtNHE+2Zu;>
z%$xkMv2~iD4zJ71tC48RL@xgNj!IUHa^IbmrD{MyonV6|LHj{xni_p-8YMaD1|s(O
z@c88D@x$l{5KaBzarZo<8fWAubC87ZINze@!exzHAml?V%EG&X)nJJZ5048FFa=T4
zC46PJicRw%2PZEP-q*uZ^aw;OHg=?}Ji`x<fDsd}<l+j8M|G1!Qy`o-HAPQB68l2c
zQM^LTmK{uU|9%J<1&oD$KSZJvq^0Tq6ldli-3@4q9l~rJh;s^O(?JQ~UkveA_>Q`$
z^pChc!~EiIYQt<^!6#?JuqjR^Yf=H4C-wC_Xz@yKj6bb6Z9NpvnlPXx2WoW3N!!{5
z&1Cq9o}8&VQPdhEZ>n4GZ{X5^HqE9qF<U?|+^$B}gaHW_s+LNzpl8bt79d{HZsO|U
zrQ&e0C@BmOHg00<{`EF8ydr4#_igiwXzB*mQ?mye5rK(3yn5jaf1Yliv<2-x3@Jh#
zPz=lwQ}-qaz~;tlcG}A@)8m=#h}lc6D?;PK2JNNJ3_nzuWFC<d-=>@YBHz4l4A;O6
z?+zoxNkk<>cPO6N*{3~dvsv`OsrjMd&+&X7sift|%JsFTh?O~Ddi4-s`oz!j7FNvP
zWw2{!_%f(nX;R-?t2k9-Lk_nF&^#$Uw8%u&_W2f#J1j5t%eXm993*QXILm@5BPodc
zpW5EC5-cgWQQlS(K2h;AxTO5dqCo+$gCr^*FfVlYr!Am@{}?E&zB~);4#NM1X+?3b
z2)jD8Zs{SBA7m^Ap_XU56&fn@@wR;K6?VUP!w)h{hvCyvVo)zH!y0BAWR-h;TT}K#
zu|<LnyIR5U8Rs|qX8Y>okG)Q0^i1-auP}*=!nm@mpTnVLZrt19R46~WYLYRn4(+Bo
zMnYbI*(2%u?A%hWREt~0WvwRAVr+F9{r<rM63rG4m$uEx;#|WUcr^dj;1=RU{1rzl
zMDND2^~EhfRiPq8Ow@8#-BF19;ca2Tvr93cXQu*MfLE*G->r%ga9P0lLT{&k1rs~^
zjn6$H9WP6O!lQl5>;TUR_{f*POE&gqKHMwO2lW6f9c93fKIOVatNIt@sez%!zxJ#;
z*mF6xln1~H`Tg}FZsDT70TvT^NsxvCaHh)6ZXKP74`7ge+kfD+{soaro!hDZ=m-)_
z45tf=+HIIR6aRQbhWEaJnbck6LUyqk?Ct^3t0)$ic!t;g!JO+9vHe}D9>rmPdQYW(
zB{aIdO$vmE;<o~DB&PfpEn&g`j{*YbWD;yY!trp-(+|qQCRdc7IkhN2Id$<s)wP2`
zOsU@6G}POLlHOx_yiCx|(d~J{MTAHcrOf~Q^@*5JyF+2ukwaU}|48${0A=3)r1^hl
z*gI)UrIEzvo=U!`t5b5SPL%G2bEXtC(3TrtDN69yxv_7Jky=+w9&yUDz_Tjb1#iKs
z#Gl)dKtMVSWZKOFvAlnu_YV{a`Md!MSI{H@(wZPP4w9`PqNMpi;_^8`o75!l^wZQe
zY0(cg!tC%zbTV2dC0bc)L%Zucj4uSlzx#aZ%>Z$GnzT#W%&gi<0fy=pnL!bS%v6&r
z5c8zX>11_8Rvci*QSz}ACmGiWn=0s?g$@0T`4_W<E*R^J>;S@a<w+a`AIocY(#)jp
zw;(R+>~w;OU#oKhx*tJk@TZ2o6aIbtI_toSioC~XMYk|4`0}n2yTZoShPujA!x5OY
zi+1i)p4p%+jWlH{-Xi^V=}65Zlc|YKtjmQ5-j&hWwbz;Z1$C?Exm6>1UGF#53s|%`
zwzd)tV@g+(7hm6!=A5wRVN7+OWKhrht{G06NWCHu0(D%ov<@7yqA^iP88oDq?vCz^
zrf#UiQ#AMpcon1@KsZb}ler<;S`@Uj^wsnXCqY1$Z}pdT;emw(#pKtPvvL*JpeMoc
z01GHhXYp9AtlfBQYL(La%NCKd(%i$O-s+|7e7h|mE%LPc;(jp5Jqc=fiLQ5<Hj;*n
z>H}beXtSVS1J)eoC4bRpMl2#*B`RDJ>vP{2Jm5&u;`Pjn2TAO4-JOCulf=&Il%liO
z7Ll2@C{><+{4XVphGwqXa7p$RnKta)U&PN+E1MZ%MoQb-yT2d^GPzQ&s5^=PRM1oM
zBEMx6;F1^KtL^&`X090Rf3Qm3$d-{=Sx}XkTLNXis<C=0!0<R7H1d$f${`bfE2Qo0
z1ofgmWegOxcZY*iO2l>pXu)Xm&jhskD!28lm-6L{txW`o5Q9_?=t)#OxA#~se<232
zD0T$RMdwnO$Hz~f&2_B`Kn*z{L{*fpIH?5~6hHzZ&oTROyk%a}W!9f0KKP{Jm4u<A
zr86QZ^ts&UDv%)n33|{C3qqox$MKteo1K5W5iE`YP+5^zWCC2%iG5{|L=X#!mS-?w
zHrE2_^8yiZXl@l#*1f1`fO1pvePjU%Iv*P)UaI$TR5@joT5gh&vuPqgie2@ncgFke
zaQoG&d5(uDC!xI_kruC<8XuAzrt~nnb*}%1lJ4B}dtGMej}va)+0f^g`TvE&L7K55
zUijAXV~*|=@VPi=>|TK?t9b`P4MzC?meT(H6s>k>iBDToSyMISCupRO7k;nnxd5OE
z`pnA5L%|gUeJy4&Y&!b#mhhe~+pTRTUaz=p%{X=P7#P3u#PlR6o(5?y6Uzv9g}nX@
zA<zT7D)62Oq-ox1>UQSk4v+wz)}<^j$;>b3eI?Tqp@s}x4ZIY|;VsAg^UPePy9cqp
zL%Urv!DIbq;pyz(?pOKm*FP6O|K8;N)LSw@)*Y1|`q<&Ki6zsIab25Jh*L)g=;fRp
z=6cOrV}vl<g@WRd|4LfhFpCXZIcisH8RE?5T6yHwy-}Khn8*B3fXA;7-jfRHrKF@v
zRy61HL~neit2(y5Kfu0K&7*@G{3rpdEOWy8A<?@HM*g4fA=0*QUU*7sm1t4M;U*Yy
z*gQmzd<$&5)ihh``jrsoST$r?l$}$4$ijHpzwP(Rtdi3aX`tW(K#zdz1Yloa58RS9
z33Bq;Ny`MA6ok_JIwe4zMLQZO>f0ns1yYZ&!qKZ$4V7oHdKP1iS2gXjq?SWGEBjQ^
zT-%fj7We1-eNg&yME*h`7%$(_OynIbeFzAtTK>dF0t*9=Rlw^Q9+^@f#X_QxVid%r
zy+Rrh^8fUVuCTov9u~GYHCbB>@G`S9Ff;!#D>GQEd-bRn0c2Io_w-$8*02Y<IOwW{
z3-Ake)k7b*pU()IomWk!AN3->B{EZxQV~N3ism+uoYl^*CyxP2RMmH9C6;ck$#;i)
z%^%<Uh`N%0*U81G_BR_9qJs@3KZxy>;%;opp_>=WFP_%xgy`kG<0ih<HuCq>u>SwM
z&ld>_3#t~Zj3YtizqYk><?3lZ8zFu3Ta(%NP&|B4+|L0z#CHO91)PAKBE?@O*b>rG
zgMy@Cxsm$XTKa?H>E}?87y`LOsHd3`fAyD8t~Xe07Q34Z3PDjEehna<m0k4l7TKcj
zH)D+4i<td0K)ojarymvz?O{CXGn2Sk(1B;>e9o)<u#Z|<3ZN}j?pwT3^aF>DdC7r=
zmHYlP8&P+WQteir)p0~+Ia%uYgRD9&W=}U$C#|rFP|wmvLorAHX;xod54E!Jdil_~
zX}rNuq$CbIGKAF5DcUK!B)_=0-q*>%!wyZ}($3v$FmpczixJ<!*eNww!H5u|ejsoZ
zMpE3mxEHe^P7i}iNA=zT&8r#RRv(H~^tWLI_xRb#>f38oifMCHxRvc+lxfARxyBuQ
ztE!_29P|EC_*MBH3;vsZkoLZedW~Ud@sz!oO{Zl6x#zut@ii`iDw<6kv^xTgJHXBG
zdmZ$yB|&y~$UtFN30E@606xksTVOW<HHufDL@Cf@crDxbCk+P;8C2Dc86?h1exQ})
zQLWNGjL>@ed@d<DL(r2lhm)O^63Q{BOhiZAi8wqRWOeM#boRh)r4}wp83MI`S)@j{
zZKMB{j;+Q6s1daZQSWD`7m=Sw9^o~qv>C?2`DbrCxm!0v9ujE?z&rB@yd+=>74^0l
z&O^}XZbUr_;~6ei*JaPm+-zz{wc^ah%~i$C_4T<xWAm=#MPp9R(`__7)F3742ltrd
zrewI*gRpWKgL18mt=rT_Uqy{^yWr8wlbaqibtNo&>lwR48M`M>yeYcE@vlm}dW>SM
z#iBUEqpnu>@x|Fp`_kNdd)7Eilam8eM>|(Y8z)M%+(@mzHwhLG5*Hh_@Yl~6)1N-6
zomo*fi(m~(XdG!l%Rw{3L_~C$xiVw_&^dPcDT@>O7NQ7+krWw<`FyJw-n6g192m06
zZ*L|bOrt1>uIM6$Y9j_p%m(wc-40J%Vgie{9`_lqtajqoPLX+kbU!Z~b<l&jLHKO*
z;k{~};%ZDJN=yXR<U*R#1!mp6&eDSQTr}GGqZ^SXNY_;_+G=W<-=kcw8(e)y%V*YN
zmDxx%Yp3b<yYiD8NhWrJI2UrrRLze(*~;OKBKBPyDRts5Lq_DuTJ;**kpXeK!+ft*
zyfz<R#PSV1vsj1<+@&Bk04?Z0n~LD{c_zmc@Tc`}JE&eUIGh!Uj}LA*d3s|JVhiCT
z(boz#gLn;{di<xz%q(`^ZGGiZ4f~{%&%zA~!0r019EUkME0$K1-i`MM(&Z6G_9A@f
zmXFw0gWfv|;ahC&L~odxiV$hM+DES^>A~L7=c@1KE7aarriy=|>LGI(PEbnCrGda_
z(<*NLjkd<5TSw69JB(N3w1J18Hbhzk{?Har`O8>ASJn?DQTQu0k-||q^^KB>uufCa
zEv<j~q80;7;%h;lg{zjee7v~txmSDZN1A2L{DPF=&qHhcO@6Qkl`c+*2^fjQvg~sm
zO=Ue5ov$3kfaq^y%_anqgB8Y+KTXQ}ZAyi9|9lvHFPFwEkY&^kESuj9mWIJeattb-
z$EI4=1LFF}{r!RA3JI$ey!_n!1tVmtT36qkKJq`>ecVS#n3(olfR7e7LL&QlV87vQ
z2pK7n<yWJu<$bnz@ww3nZwOy$LDf^7G`6HyU_4S;J+eLDuZLER!tlVKUlXbpKM(!V
z2>BFV?{Sg*PD8MeSHmvwf;GXgopw7NM_Y^a&HZ|gYG<t?>>%mbR6RJhi}r(ya_SSd
z%*&rysvM6@3Uv#)GI$+*G@<HTq2uUH-39lGbqDeuJBfe#R#{k*pYvv%w`p(usz8(}
zJ*~vrssteLS?uT^6>(`6ZU$AN+PmIhxM%b(VTDfBS(1u6%Q4=?nptCcUL&+5;6ta*
zm(Wl$vap};WTJEs+q=2V@HlKuVHrExSEt0@{c`v>);+yoKg)$O-X;%}&z=beq-cxy
zk}AtjsvS$e#cUztb5A}+YfS46y_bD3PV{qo0+yWaxvry0s9jk(nV#3LZUp*f$puHB
zS>%5!`NZsy*r~3|0RP=f9}QDg8EYSghtL{=)UMSL^g&S0Qh7v?PGPT9n)Gb1$TeZL
z^L8&>?YW_zibKt}BxlNcW=cKb;mP-QHD+`EX?G{wI~CFlD7Nj~Ew)mTSkdBCe|U9<
zXGhuu#oK4hO!``5b4<bL*+Ih3ZFqo&NEJ$siRy-Xng{k4XSdJx2VzE(Z)v!_wAIS|
z&7;v04uDQ}`C^u|p5m~FY1MgLBzj+h&Z+M3v<E?vCNf<cj({`oNkny-Hme0EA@dOZ
z@$a`fo@f+T+Z8Ttd$BGc(Q@NGzB;CxpN*01(>nxpWOIxsDBdwqloz%vd%^z6+m|Je
z+~=b(Iciuq|75t}MJ28^Ls)|h51y%~1{&&yFa!s)bQvjZC!fRCjqfJ5wXLpD`huNh
zW^bT_>{?VX{|{>`&+qi0Mdr7D5BkI&Gv)0+9)iSR_^XCaQ>F0@xaO$;I94sI>=)&X
zW{21zFuX=I`_9b5Tem}f*l8C|TFfHUl$c>L5)w;}H*R>fi4P*XN7jk-!~+id>zp<Q
zYjhZ)x<1kgp5h-!(bTBRFrgBBW=lToV$Ze~HnLtY#8V!;fsmkm)k6oxT=Fl!&IxEw
zGE@5EyX2XE@4nn}evu*C7x)t|`LmDdhsO!@!hJcXdE^Nng&+Elgrybr_;j#WtPl$t
zlHMf{dO`cTqN=GY=V}G#44M0Lx&{s6Abl5J$#^^xdO4Xutc`w05Ti(O*~Zo{P`}V{
ze`E*8UFXAxg>bY#Kl(cFH8mXs!kb4+YnvW(9y4M4bA<DsFKX1M)=XFKd8>wci79`0
zql1d$p0-8sDS;LR`gR$DLtEwoWy>rZHH(GE4M_*U9uynic5vA??@GugGL%G@?Y(>;
zyxd)f$+m>!2hU`|kxYSS>8U7_(PHg-%P>H1*6D>Bulf)~Ogj2Ai!Po`^&6{mLKo;R
z@;##5<R!Ld5y@TGRygds4Q|jY_x7+)M9h>g`H@+$%fs{D>e<SxrpH3p?08`f2hk@t
zyRNo1JLhD(;~u+u?|s^>zKc>r4cqDUCXZfnwOOBB+8HQyq~kYRO~-BjP|Bf<kNPgO
zq>#p3dfDAwYB%)5)rlufekYAVUa$S-Glfw8sPAQFny5HD?DtW-2x`0W5wCz47GhxY
zJrFzN(u?!etUawgG%jr^OmZj88udMlg<YR6bOtV(s0q=_Nc&#y8QP1^PL5L2sJP4&
zCb`+3ggh3!G;>ub9ovfUf6iO)5ejExt)PL1wi|exj}`i!gBCJ^Gu1f60rN@5s~2)*
z%gw&WkL5&gx%T_?uC+3gR{Cle4^@jqkb&^h#joS@g~H2!h%ufi134`_yHHoJlBh;K
zmk8}5JYdeaL|ot1EQ0rEFnI1QNZ8JlBf?-p`6>k+^mG7A&tHyf#-;!2@e^`E(lF7T
zaSs69(lR6DHeXA0O&%uN<aiZ{omWu~z(KPVV#0mY<Y%d_PZK4dA*b?Sz0BM_M`=p=
z=R3KnBczX9>C~5@lG%+}TmDqLlA_Bp-D+cynq7a`v_t1;a}(0T?*lbd(p0@aN&;46
z-9008y6{$xdVj*vIMudkX_Rpaz;Lwsw`L~gu;i3aA*$z`6}ZVw)|wAycXoArmqrmM
zbB;Fw2L}g<OuCba&|2fEMpvswF1{0pluP#dslbVl+g5^g^z~XlhZdtNAtwxt9)f&i
zvr;(j9ZF7m*Z<k4>4kkMqu<q0y}9vl+N?qCo$<{RiMc-|qh>Dr=c01f0t^meM#E;T
zV${&bg6GQsu`NQBI>)rnzTJ1`#P)n<7dEo>s8v6P(ES;n`%aM18pW%3mh7&xS9b?F
zuUBRxk~XdOyOb5KSMzg7J=dsqO^r<Sl3-(Il>m%Eru6CxAu39#%$cL)+)bgKm%Bo=
zx^{y^GQf5xh{8l&_>U`&qxvMD8L%f?ae6+(JDalfHMLxpNp$FTIn>yn+P#kDqsI%g
zs$UDSioD#eEr$s<4Iz%X)B4bT8nTKT+g2QGn~XmVtpHi`S(cMSNQkOJTGnRYS7cfJ
zL;0g7Oi}Def8oz!K!#NC4yB51&^iPe9JeQ4yX<&@{dB>5`k1+Obd@{9(^O?-EXklY
z5%6Yk{4)ZLR;)@i^tsdJh*{XxqI#Y@%s8<_mK^?jWfs3oX6~%>Vy?+%$k)sEAl>z5
z9^rbkapX}dvRsS)xDly!*cYI(<%@(bK_!=Snw!qY)ZcTFW>wQI3zfhVnXX={9^MFD
zw3m8ZoYy)U6kQ&&^}_@`GoGC~*melb-I)3K_-q|t9AG~=y?e(+tmI-MIZQS}KELaP
zh?DrY^v!>20cz69F2e9ngGDTl*)o5r3rpi@_@Z6<$B0~1?)DYiQz4jaNy8hYo_lOI
z-vA==?=4Z#RU|VYfl#>F8dDJ1|D-2yx|h{_IkIbD(U^^CQ!P<gwJf9tF7z@$<fuJ6
zuCK8k(96-f^c~p8Z$3xt8k%&l7gxx9Qpf;^F?wX%Rn8Yzv3gY79gSUXH*LN%c0&m5
zs+oFlNx?<#zY_A`#tQ>bHdpB7fXQ|(eSK5eMeM|yoL<yBSBg8HJf_q*zpqmg@5+Dv
zi_z7JR=dHuy9ywyg2fU)>=?VgK1R6C#Cq=GXr|X=rutIjmuoh5<7+IDgpP0#_}hMq
z<UILlSW0G5{k;OxLOXwWI@0m%v6Sa-HeGSbC}H@79k<s-`jq7sS!kao^wTfKlbiJq
zg&gN%m2{b*p_hjf2h$epW7E^Q&J)9jfYWaF+Y{MX9o5Q*e`Rmir`piTolLW5o2TF2
zK#cZYh08_t?H-gi9oMVKZ1=SEE=Fx28TfV)CPeo1#B`^LiF!|Imi62ba#iN(*$Yg@
zW4tl(7aMDy<FM^t05H0Rv-oD`q+d;Sto&0AzIfI~YY3IKRXBFsQpry_$9D6e(d^;m
zyRAHzx(uk(^TO?iT(7XB`x-7?iSx(r5a6MW$_ty8Yd3k&Oien{@N5g_$koB6bn5FA
zw|>3dZK%FT;H%02(tK{;5(vIDfg2#KB1Nfz=%|119T2MX=egJNemc-JkmzJa2ISe<
z8NJW7k-Kqx+eU-sh4&DWWYC5gulm>eW{^<x;ZUb5DBC}~@`;_`=U~*!7vDL#*q!v5
zTgd8ccwahV;fdI_6}swiWU009&VAQ2TeY$Sq?k-1lY8U)&j$b*G$<%)Gopw5>)EYP
zX`G=Fr-lOU#iaXt^=7^;x?}jd>nla{RZ(t9hDzn_YQcn?NB0LS^j{NS`3Pxl{S3lV
z5IP_q;0HqOhCyDZ!L4g0(#Lkz%2kgc(z*q49^g12v}(zv(1m&L1$>Q6_IVNKq%5a8
zCTFmmpD-!j>nUBS(kQmfy^Eal<?ysXbs@IuCs2^)vZ~wQ9E|S{pWZ#W-N0l1L%8qN
zdA{RXcNm<6?iBFi91z_-+x#|!Y2QNK1e)jfhWhyxR@3<Ou75H601<ODcXRL)`gMt=
z1M_|Wl`rxP9IhRZm33Doo^Z)iV@tq5qI9X}h@t*jx2^Q*>F>^Cg|@G}{rGaNMC=a~
zDBUlE0WJtY)(Kb02p6R;Z8%=F*4}9XrvSE`)@2nXN_}(DGxe5_vL2|6bX@_N#Lt7$
zogGw@`x>%cUHvJO=fzXI&EWZK;rYQF86b45frPx(D4mamMulFlJJiBX&tj{sYdxm~
zguH-Mj`hTFi8p4`F*kt_gDKVWJatNvN0PRgU}OEKsBbR!<y!g!;d(vZvZTR9Ibz}A
z(_|P0k(}|f761}g*3O)|=Z^o8L<^bzHdCpztty%ItT0mHBD$Y6_PGOR&1v!<U?I_3
z>-*H<zSg41mSif`%N>rEFQjE9bU$Lgmb>HMMS>_AHFNdMQo1Q{rMUuD(DHKs;vV4P
zpi%xTmW9V?qXKt=M!_L|p8eYQ{v9lXi&kb?=hFr1n^69l?bKh$i2h%K*05ha&=JB1
zHT>6_>E=qAmE$u$bidwvY&)lC;3tY@WURc+7xq&^)r->x3lm~nGZkqP{<CEYK{5mz
zK11=Z#Ca#KTk@S(;71KPb*J($qy@|0sL?XL7+;i;5~HrsC)A6Pg>>tqFg03S-Fn5{
z_d;z|-}4+JQK{ExL-P)vN=zudh162hkJv{u&wqsDKWLM@i)u+Tk36L#)M6csDNf9z
zbJA&7gE)keqq#ke5rxLOIF|}(QGll9Z_rXkQA2AS$xuJhTqn=Fv%)xFRMGeCkFeD4
z_!Hnu=EAcSYJ#6kj@-p263uu=AWn^=L31Zeb_01Xm6L{8khUk5vUB_fT~fRIcWm^J
zjz&qT-CNc5{9tevVMAve$v|P1WlC5l5kz3+*UEzwj`l8y!~2hJ;nCEbiNR<Pq^JSy
zpr7z0UJN`AJCSR$ZYR2`B=)2!=fk%TA|FD@snPGe#}HmNC;OuK`tM*irQKU6*;nJv
zp=pVbj0{)n=c>0xZP+jA>^uHHXd%!S2qpztF87y)l8aLx;ZGfK=7Uf~Gpg>8-n;8B
z3dU*>?f7M@5QB$S!gD8-AF2_?4vDD(wmF~>ColMo^<8$%`>0{SK<4tmX<i+9LolC$
z?+3>ZiwCqdmqDQpfxpqu!smYF5EGr-pl$HJTlb#32Vp-n&$>pUb(G;ws8<&HekGbs
zQN19>kEn}KIIM55*f<)3JlcG)t$sTeSk(ABno0r=?%Cs6^*77rl;w*2I2zI@)N|`^
zycqq-wo1rQ=iF7Vdx=(V=1P()w{b0&WoTZl`Uzu#9pLa__zAmpHWOqk30h;kuk3z`
zBnY&!LPbl!^Mm95dV_-Yv)`0Cyuk;47s~u7F6l1nhAaxT1P=Ng91LF%lZ0WoZ(@EB
ze2B*=%L-B!hawsU!A6DP>?Az5^>GZYxh!B<G1Jl`{lJ2brX>8I7iElok3})5{f7On
z`z_5Qn(;9vQqHeM30UMGxxsQ&mMe8DZu+||3*z4X*HSSSAt<PdQ1Egh=A&%qtNcO_
zwuk@s%Pn8wK}yWH|CsS^qji&twm4^l7@v3`F{u9z51vSVcUp{ZGP*)PrBmqeKjv>S
z29D2Of0GJ+;!U$}y8Xo(o-2=N6Z=mCY>aHbe|NaY0o6`x-I~+z66$#@q|-h%_CE;M
zMCU+9uoKJd^B9rJ{JXj62d&175O|`v29P%d;7a^MU4vWu&6ID(%Ji)C#In(!0VsUx
zYToGz-h<N!aaNwxkI#Y7Z@Ev80;LHJK*4xL0|vb=AMB{RoYPC*NIGmUO~Hp2I;~0t
zbr`s!zjXx0^djF+?2V4u5&(P_b$yo0>@t1W9(BWezUFGtL_Dgn*QmSv7<J?Azo((`
z9(Wnkg7#<rKWv+x_ZNK{>!HNx>l(SA-j#srD?tt2^E!>qO}ja0J@)nV)$W;V_AR@@
z8fe|~azkW}ZXMao_%ti!hsVZuE?pfvr{?Ca*bip6eeIwd0D^xt>sd5^(}Bjml%Run
z#w{cKvuia19(+s%vX8D!c}9h+3dS319whZzwgaTaVZYKV^OBpnZ`@iPS+1kb&R{|P
z{d#H96~%*WYfauWxk=4+_wTsI10CPrG`@7?QOjnSI~EHswA?)2=EJE7P``Wt)x*Ut
zeE@Mlv&TX4ccV5H`4GZ&nD^FWbY;#<@meK;Q+y3O+EyvkcxzEfZm%|=!xo|BWtq5G
z8z7!fmozP0CE@j4e5(6;dQ6AdwBDxL)IzK8dV~X`r2bkwpq1D(-k1m%s?cqkpqnDd
zhMy1J^*DMI%I}S>`H|ND@BT_44uc%m{X>a!gm1I2>wCWS8J|P&F$<5Le43G4j8ehB
z94%=dtBQ@Ao8*WUO_YgL?ymaA?0j}}rhgI0quAWEKU(kWy>gWFheUabNx*UE*1>Ef
zvvD`;Gz74$&se0Cb(K$IB`9yMBdf2riJD2{qo88mR{^)CH|LDg6vEZU_xMbRRd)MP
zWS7-7aD%6vrgT$nCy<v|%%3aI(eJxnBgf|I7ZQNB1)e33Q1f`n9(;+9;bkGrXZ37%
z#;*IZ&uPpT#qrqzDbBis#O4ge=kQH%m>>|bki2$%dPQdIIiD%fm#7+Z#zHORS+Wfl
z>2<ZL?YqBh+u4zoh3V3Get$}EKSZcebW|(l6<%1)Oh)4M`DSjaO}*h(qeK*??;=sN
z$PjO<v)pyiU&`U(ft!JqGYeRJyIOUe_q^JrE9j~~-z-Y>`x-XsgxK~kM|5@FA>9{r
zO^Rn-N~RT#*gq&qxVk1_yTe}Jl?12HOg^mu3pXJu($Q3$8n*;4wwOR>Q`cBtAYM3>
z&Rlp3NXAk9<GVTV5r6&J3w>{Ux~>PCap+VVB=sFTzQBwo;L$B13HsVl{*vwhi-?K;
z$I|yan7fUeC^ZF{t$F0C$#)H|Ij>IOhFe7D*5{p|3wf;%^`FELQn`DrflFUfZ2M>H
z5tYy7#%k*B>BbTtC<Icm=(UCU9G~=G`^&iiw>DmtgE#%!NNU5iq!}6)%YdxA-%6pq
zi`d-xu4V0cOi=?2E-yhn;5J@W_|A{Q)uoRC^q?SE%J=;TIZ8*>z=fER<9fTyg3c8$
z^z`pt7ZsB)ST#&(kyv|G-PCwGq5O4o;laJBlIvpKPrI=n7;kmJ++?MOUc~*H%10)1
z?x*)ghgk4yDcYfu&kBn}({BpDwp-X-ww!mF>Rq3^-ZZj5QeVFHe|^0@&Z_iJ(5Y@#
z^G!9N6&V>Y*sYNUe8)x*3R~5#BA^iZ&aTaxSKPP+sPDo}K8<$4%1Y(ANVJ`H9^3&=
zK!7+xZRM6kWVVemPbLt@wtCfJGiCu_Hu~y)g`Wr@Py2v_6w<r!*w<sDpBVvcIUVTD
zSSjo`pN8;YPk;X$usDtEgd^aIa;uNgZbKQ1t7_$S{cH6;2gNu%So}x$e>F(Url`D=
zjq^RM6f0)Jb*eL!qsB49HvEc0oU=)hnMl}8;L#zhzsYm#njYR?RP|Nz+2=9)kGki!
zySDY-{aNbMDPvnRUfoYdWv35ypmP8t8GN;4d+=Z-zIv`Uov>Dv^*7UP^x*2UyAjyK
z&uyB3b9d`<s5&!FiS*I&f!4yiLbI*c&+vWS&*EKYr+WL?i6ekljlbdN3tW}6dm&w4
z-#tG>Y;j77=Pa8_n4Uc1nvu`FrB3Eg8Y}GN33tG1Y~st=EH_n*uCdKGr?abBonKFc
zS*Z0mCx(AqPK{ZYYcM^w@&Y^vad_gkN&m0FE5vp0VOvqEr8Ab>(modKD9}RuN#;+e
zXLDh)+y@eTDxoo2m!)i#lK*figv!uXv}U80x_P6jzKIT59uCBTp4xJ=_xVKs$u`Xy
z8u99B*k{Da)q2+)zue1FtLu<x%3HP{hZ!w`a|&Z{Xq?>x+jyaBp+sZqEhQ?*@`A>)
zJ(QpM2Kl_X@q+DE&ahB7?~n3DQ!;R-$Zo@ga5P>bjbpoVbDO<q=u&(85-t~D*Yw)2
zUFhhP#Q)Vwk{{tC_X{~x17@x-mJ5sEzA<Mqv7L(##4lV8Hy5AvR?vQ58Z-F0H9b8I
zf)rm4duo%;mg)aq)@Y%#G`XsafN0qH?#%}=pJN>oO{Wh&KHXKG3h9E~xzEfw=gM;`
zi_`g(pV^#&U{^X5k?ZPU?$Pd?+f)!1z>-3$>-om^9-I-qzXVa6Xxy{@n=P*0>w~+T
zkNEa`9DR*!-%vy0<z1IPHy}*n=qW+3X=!6)bHJz#aY*>^{~pD&P_Ko3wny6eD@Xek
z%%bR;(3`1?snG7k%G2h?W^3C61BE^5h05&19>DsKf<(IM=0Q;7*{s(A1xMwPcx-+{
z`Q`=J?C9)3d-!E0Mf9VG*kU1>S6j-g<A}xfPe6de(p5bgXGg1!fxwpO`DDDhy1H{S
z&~|eWA_PETW#zgpH#b2Xa=@q#8~}((9(lagyO{^y1qh_@?(o*(^NGCd(B3U7Dqu<O
zHSLeT>OA2Do$TQF&WnK8$Lmp~Q9J-QQkDphiXCv@MO^Rg!>$I6IAbwSq`QE4@1Tj^
zO-qV1|1o*u;u|W6G*-xn>ka%X$v;Do>-c8(3rCG)-s}z5M%<iaUDPn?nV;81`+jpx
zT_o55TX=lwCSl?-D{Wpw9W6>t7(W8^+`22*%w<coYgx3jJO3;Q>hbbZhq+&FgU9=u
z_ZEw_tGsuEwS7Pc3J6h|>aPP9Fz@SV;2|0U`j3aqiRT#JimWZJxjGQ7f%!CrsC25U
z(g?Y&?^X2Q?B7P0y?5`@=E=t*9MJ2b-tMP)H)lYW-tzfvU@zNTH*aY7a;3C%&TC7i
z{ynXws)`!3-0G2s9{4tocC{5vG_3Kmdh8vIo`;<>C;DG&STYLU#h_LB!d=YLMq*zY
zx!Tvi`%=^JbtLW?siREqt>!#94XfO7vj`6_eg@o#e+=aqb|~<4UN#y7e-C*%5O*d+
z<g0edKQ<wg=Su1G;d-#0<Gyra_th<j5wLrAL{6bLcg%Eds{F^2`uqZ9Pisy|lxwwE
zdAg5N=mj9bYi%&j6{d^uYTq0Ei?yE(mA>rlX6#@SzJPA14`UCJ2Z>Kb`>3W4dr1CW
zJjj;%bgUdr=~9NtcDc^ez*xM?*W1a2Zo1_3w%x#wM&2Dx02DcQYNrnP?d|jm`;A0h
zy;Rf+O#O%bkN%GGMPqq`%1BWJXFm1c?V&&qD9oWk7<-ccvb7)F{b4zr6z!DmUwc+~
z*-JtTn4|pvaW^0ilmV~G@84+xj@UmIKMv0v^vPKON;L4RZY{byp=q6vPepJ1Z2vXr
zQAGP(v&F6<Ei`vR$={Hp0XJif1x{`Gzb(U08mdiBSvipVC&iSV?$W=dj{eJ^lo9dN
z_Jt23UNer8M$%!jwH=ptTWdj9mk1*lNm$G;E@AWtH4f*C?DDMeJL?}CvM$xp`)rPu
z5qugHXx||7rmNF};-*!Fk({;=xJ2?(2We`~@CT^fw{KWv^={i8;KF0*F|vj3#Lg6R
zPM$tr<Hd0=7FKbnGDwCb4r^YNKYkP?Ndtus_J(Qk_NkAyPfTLf&mNy$+T-ctV(3uZ
zNfEWNcXKk~qf&h6{t<-{?e<6O?cp#TR?hvM5ZKK|yzAWAe8v3_-Q^~YbxFj&#oW}@
z8VIdMmcC6KcXDoxA5T!(!*r~F)HdAFLS_2>0M!e}&g0Tc<@W|&?a1osLE&y!uBk{S
zPe51EyMm!N%jyq*DMByqs>=eux=ic#-AoSjBpj+%+<)f3jl^2Ydzf$Fo%IxL_WJtZ
z;cr{q`<g{BZ@NryeH1xUFEml8Vkw0hJ5kQXc2jD*<8#u+9Q;J9y_w(6PG1r)Oq5Bd
zW^d7QH(ZNX^7q)>6<nXSV&dHEy67^iwE1FyVb@DTcTRM*Gau?mHbrZQ4+gv#<8tw>
zVwSLFhdS$xA63;<9vry}`CQFJTil~&RURJPAKg*7IbKDl^7d)5a4ggAM8kL{??(si
zH<Oi$)`R1qeWPjV3=&R+w=L}PQF*!K7V+ZfkL#;5;9RX7ov2`4+gRA#Sn0_>!>e`M
zU#zF{8JHb$-58{e*Se_Y5_+BTYUD^R?}^7yCw-pljpeR?LmO1`mqBGCvM<ES#z=Rg
z#>&TMcERa(uux-sTmZE1yHb3jYJtd+Nt~=dpPjtgxV-cNsfG_@Q=QU%x=AqCy6Ii7
zy<{gHV9&hwxq=D$-p0Im8~1^%n?}_qMZ<drSm=oVv9pJ;zxh)Xs01}`b*Ja{@<emv
zS?fmmCJO!txJhx^b`fZTAwx&i)k2}7)Y6s5#WS$-fwJ}G<V}x-!=CLB7y^jboV5b2
z$$CBSivYV*5ee71%q;qeI2L@^cGqXQevnZU6`h4x@pvq<r?yS)`f>`e$%vM4y&CN+
z+zJB4nN05+PCGoQ#<(~_L>#R=Wc^w!0E>e7!8@R{P21A?*O>WTEEG%xanHKgR*azF
z14n1OHz48C_L)VAgZ_ZQ0*%&`;r7@EEpPOb<^g>_xN=t@qoWk$sRZonDl0;IlZOYe
zqSMcf6h`SiD{heIp0UbEG_m&{B?S$28NHha5jw1Yu7Y%J>jmDkHWycLnJ!edEjUpK
z*q@J73w^V+Y6A5V^-t)8A@)Ev7{5tN(4zU>MFk`V+59&48sMqh2dY1%5Y~$1uO8=u
zX>&7Q@c#0)HMzdAxVTa{7Nf?s*B(nQ-*gqaoVNPz;COxdsYl$N47wEpGkwN{tP|f~
zSjw?dvqfC1<PXM>^6(Wk)-`R2nv<k<fRkTZ5&%e>+vKu-!i#^yK+kDVpj(l-HyrG3
zXvz9zU&Z&jGK!KEw~FA*fS!Hmuxgh>(F-_7o%*cDoJ9q2w^Qyf>?j;(l!gl)TYsOt
z*&j$Oy;;&@u|EK8CX({2lD7@^-kr-Hn#wo|E`h&N&QtH@8}QY@!LzHMCDqXx+};a6
z?~RF2ovoi5y(XZxfWcsMFlF$C5j^GlmuUZlt>W(4UvIj@lb@jCoPpc1<BPZg5&&_2
zaWQcwW<)$$&VdAZD^~3$IGpC~1n6Oq(nwSMmf?FQS1Tgr({#|f7St)<F4{BezfkSs
zdtNG_NiUyB!Rx_le|tg0oEE95ylc01cXxYxxV%*deAfc{!To>ldy+NwwQW8Ka19m4
zrKQ3sgKgu@xzM#(U60{!KNv{O0|5@AbiYAdDlD*SI4_?t6s~_dY)h&=JrBtF`y)4F
zrFdMd5J*+mi6LaLZotV0dr$MvNQWYeZq<P(HfkKMAAE@Mxwdsw;>~sc3~alSFKt7O
zGtQI-M`MkeE6Mt*ZE!VeHs!veT)?;5ELVLEiy!krh`9Vu&6A9gJa3oDp0XQKy#Y69
zOKtgjei}Y*q*?i;Ay;vHlIQg+3gU!tt@_`Vq@b#V#xV98()1BCU0Di+T7kdiDJ_N-
z8tRtvvj2QqO>upZ)}L8{dE|L*B~xPO+~|m&8Y51%^F24a+Rjc;EHsrqU|WnG8RdHP
z$g#Rl9!s0o*n#!}AD;nt?=-pG!oPPz>&2V!HNAHKSb6!A$5P4flD0pMEa+&N#E)tW
zL9}>%CwOqq#4&ouC2%CuTFwb-jdax17-Qt+naR+T-?=#%6clN_=KXGfT5izqmtCY)
z_O6&{n4{9^B$_)dBmNe$y0!MhMs{bX&Rb|bzmL6y!leVUPNn1>zK!w7$IbhS+OEyw
zzVUNsHNn*C77ri&98IckEX1iD)z47ff*^lo;VFGY!6*uE;U%c;+I(JUV+N|6b_){v
zn#wotSM=i2Yl3MDZO1o|lj%0;1JpKKc$F^gQdAY|S|jj0!h%&xMr2)X!aW<x;#Pic
zR<QP@xc-hkspT~e0(Z}omO*VW8&7>I2>-t1e98jif*;XFGvS`-5`|=<ER360uZ`&e
zcCTuBnJ@R`YVDvNn9kNNPM5cEP<5E|GYk~&V8$ot4o?rT33A{oZ!LyTFJU<yJyIX$
z9L4w*i_`>+4s8C6I}gkQXeXpQ^2VodKgi@M72EDoe*0#P7dmNT!tpF}2!U89DJmdR
z8+p)<m@!E;v=CQSn-PBbOZ7-(@HHeoErDTp<k8oJK0!JH@DYV{fq-~tIS$l)cv3{M
z(@PaCMN4&KU1e*h>t_>z(w0^`>**<qBb=OI6)s2EwT;);PtCy=)F71UNd`~L@EwBc
z-jDyjHWFDN$*C&Rns;*Vk`0)kv)L;7W?f`$b^8?7ys1drpj3HgZUQ#=k^*7v5Te-D
zA~{X%b>^PP?=VvTn(~r-K=`YXkj)^*bdOkPeGM(eDdIe-VC^d7yqWjD?s7|m_lZBq
ztkqcM78k<EF@hZS$zvFXAARNIWJn+QnelSl9ES#cKx7VLQTWaJcF}6z27YAd_lU59
zkQy=S*1P4!svST&GNmP1aHH5=4?Cn<2th^P`Zk5ZMT0s6a`uf>R?Uo+<9IrFPRcz}
zzmmGNgs8W`Nj}llLtOPe&7buwDrdwF)y58$y@wb0c3|e9$JGGc)qXySO{ABZQ$v1j
zR&l+aZMhaj38?WIADj?+<y_ZDSD4k5A2j{(b!J(g*#HTnjctp56*zE!oJ~idW*$jc
z8JXVrRxPAkziI^Y7uI!#3%7uDWfQpnhcEncs2`)PSrF~$!>vV)b%)k}5ty%;;Z$ME
z5ZRKRhjfw#7F(nbY})k|FmXvtr7=ZEn_FxUlTy%kTE1fiXHdnfBcE7ZTo3c?q_Nr{
z{~VF}3iRC*t<87nv9lCw98cp^IjM1l#*qdLAySh^;>OrF<<#g1-WS6W@~(gcUKSl2
z$O7jMEq`L_=rFm{2dI!3h`;l5WXgS9hTzjy4>xwLoKLaAhfwuV4zQ<qRjfLX5HEim
zspgl^3)EI5slS%#vn`J+SuyJ+3$fX5b25F~7nAkqbr%wsJqG!b*D_-7hakv%AC2-y
z_45nx%L`>?8B$XC;M6<Ki&2z=#3h5IS@Gs42F4zCMn!L&YO7|32U$3>7E2_Ho+#6p
ziYz4b%|4Y<Y0JV_{y8e|LYO12r#ZHah1Tek+aUgBkaS#Z`bpGiU(-&hIJU>#*6>O3
z<NF4gY+}Nb`hkq;MZ>jcMJ`(7gD=#-C6*92qH7V?ia&8ad316ZtrG+f$%&n|A?>Pp
zNUiYN<S7Z@=!dtP*YbCR*aYy9rK1PuRI1~-o9@QU;0y{0|ECsU<2#QoZ&prDMQKh&
zB7b$&OQ<$Q-#09QM21jQ#VHFh!S8>vWigr4)1Fj`>YBOMk+mbT)#M76x<}!Bb?6g~
z8FBpyV;Jy&BaTO3f7j)1mUhT6s7wMK;PJt-v8?iI6?Ax5Dl$#i+sLZC(bMm0VtSyB
zkvO^ehd2n$1F9*(c6P{CkIg`(Cy>h{J3q$BS>iOHuE!6-u;a&$pUdk6&u#pGMsW`K
zR-~o>T|y!Ni1WmR5Ju~d_$H}YU#r4ax^x*Pe>D9G5iyDCT3$TfDa3If8++PQ;DqBY
zYxPqBa248vqPNg$=abm+-;vGT@}H*aYUQ<3f~S6fRWL<27ioRr5JlJT4AmK)B+QGR
zHi!<NUa7r==qNjO5{e1FO4nyd%(hHm%rw%Wv-xx$aGC$(zP+}qt}>SUwzYX=8Uj~e
zWFgF4tg&ol<5>4vyQb~kRs><CoK`^cp$3=hVLA00uL+<)g=^@H)5lvrDIgOyO62nl
zsi6j#ZpI_101&Ne0-O)f3uToRwr2JXs8Uj-{S%E4>xY&k5gj|jf7VZ>8`)T>6XoS&
z@8S;qC>c#&JOPBlwzuVqClw%K)&6Wuzjvznao|yxuzmoJMhG?iuK&}c3gab6vecv@
z?NT|nxq+zsW>of?2E+z-U9iDnk?)~M4)x4>{FW<*4~6wzVS#*&%JSVdr9pX_@ZkYA
zIHCLQ{7TeKR7Pt~uKC6`EzNL8o=HuLDY`b)s`5Ttb$;5+Rpx-bi{r99Z5TByNiA;d
zRaHN;wN~jjYb4uK)+e&XpN>J0bz7gSuuWBQ>f7)-(6xthF{X9naYhD+e?$hF!LdGL
zZ*w9WY8D_6dAVj~N!9mcIewnvP-f(Zjlh~KB+W-dR*)uoW5DEskM>z!pCWvE0|{<O
z4V_^d10OC%9|wo4$}lI7&VB~gE=i2j$s>s^FQ(D~1KX{gQd^6s>AeDXKPU9Pw}q@f
z?u(R{O;nXYXZkMJx|~C$6L5SOjiJrU$QjbMK|)TVT@{`7@Re&_1F$s2ZC{q+mYEO&
z$usgGD_~~&{aI0NQC+Kk6=THxv7{pGXxMk&G$7driwjjGJkuFD(Kp6dp9DMz>eA=3
z_pENUa^w~uZPcv&!zl0q1{BFtg%%ll=3T{}dTG2OXhk$^BO(DPDfdV}gtTj2RzFsR
zyexn;YwVhYSj9&$V|#<mV2E04y><GMqLg)VN}>kxN&H}_5+Z0iX<P(Qc7;F09n8k-
zzj9Nnn@0k@yc~Lh9R=*(dNTed;efJuZ%hbpjRKKeN5)GQ{DHfQ%qj)W1FbgFa+;NX
z4_yG-qw&ZiBjUi`4GWF+RLEBOWo_1|wY<8!+#!@dZz_)?X)9F=g3OlSGQvP53j_v<
z`;S-+PW+po*UOPC>f5r$dKhotTMdHC(qz!Haa`$q0!o8PMXldvEzMlf?2GN}vor>L
zHmlzGIgi-6#vK(XdWqUv>;J(#QdnnfcyNM^n=aN-Oi+-c0$g50T|+hov5jRXYePr(
zNbr@zWNY0$^tHP4;H>Bn#$jAdXc?J<=as+}4KNGBCv(N^`ADdCu+ZwygMs1V9NF78
z81`XfS<7O1`;(K!y+t5!9j02Jv=})9Eh<hWgdYmz<Tjko7Q+1+kM^A+nfLdSDYE%+
zew;K*#3uNMg|>DnE6K`Ks6Bpip8MqZLoJ}I#ly<TUeRTNyLhZNEr>_M9Gu<F9%<1J
zg21#B)8nWW_&*cmi(4DO2C{^QI|O|?0)Gy%6#x+!Dg4T2H$%~4X$<5=HHz7orscpY
zhDaE3&hCfRj?3Xl^7<dWK&~@rO=XpuSpy@_{R{tj_~=fM`j4uUvS{Xt54_g&i>EKF
zyCXjCi=uchOGO9VI@@$*f?|}25bQpvg{!+<*0)MaJEI6M_Ml+`i4G$tZ}xlCYf|Ct
zCRk8q3P(k^+i9E`9PqwO^e3q)Uy~?@rim{hRZK$pH|%jpKBA1Ejn=~5!?W}IpHM3Z
z9Q>G}7L|AI5nFfz*#N2kZUx>P>v{23VLy;mSMVSg4TcDF{2y;ZrBjE15-=qSb>6E`
zM@y5W*(C5%cIO*#VO0Jm(Z58X{MUfM*EsDc3NR?G{a>CFqlU#<>QGXRJjDw__h+XE
zB@*<@r1jlP$a4@yBf_}<(b4>(uEKT?x4DLpLN)+}5bd-E=NW5`Xs*uEj)Y!%{ob)o
zG|b-p3cFf8lcmQ(&YTh*B<;VXv=ohIzau06r!L`?1hgHzUABxc22rL{x|mUF)32qi
zC+7jc;#okF_8y0kR4Sn-DqCX`2zozw?wlJ&64`3yTGIgjzV%pKtS{R>5=3n(^v34-
z)Tn)NBo$Dvg(WjN=o_uN3ZSvO6K+=#okXBP$0bj~vP`ze%%oC0u@ly`fJ1xt&LqZd
z8A`Fafq3Iytz>lf(!nX}Y!Iot7FRI{b5(Q29O48&eB(!rDj%(99pTf;S?eoB;11zl
z1EvC2@s>+s=c?U??5KH;jYtWdc55@p+*{JWemW-0;LKamzG}yafGErfI(%x&Vw;L3
zYmOnq_#<{(0!E&ZotPXLUuh~+fI2XGB>cm9X5$$64}~-4-}z+K>bor1w5;w|jObTw
z$~^(lQ^5Z~sdb5Xk_!HyiFi`qX5k8+&hyOF6~M7pS6JKR#>e!fIe%c9)obKfEHDj0
z$N2*Fo7h>*;st6|iik)1g=?KRFydD2G_4iWQ>~BhWl7v{_#T;-kME(QLdstGcPC06
zs<|!v{Lc7%dNpFSJ?qKI_oa~yoLVj%6*B9WjU%gvr%905?fc^3GMP*McO4N5bDnqk
zZISo(_)&X?11OeZc6HyDkDmFi?*zWy;3{@wC4ak#XVQK;9LM*3`KAZ3ey`YnZr#*7
zipo21bH>xk;&4Ek+Pe*lr;1o+3?zXyZDsh0SgMg(d;R!UF{a`m=2PoG(+|#aip-92
zeR=`O8#=5zKpZFpOo~FtyqzZn6SLy!PsyWo)sE5jp(<B?wPmZrRy1IJDPcTQT+UYN
ztK*;SXnK7NS{`t+>KlCiA#0G|6dA;DxY7Sh^bkkJ>Jz<f8U^Vd5jt>?ENGt@97b1`
zuU`AmiPrTrUVkyojM{0xJ!5ql{RQf9L1GbtzNR~LSB%(D`ZN2iB<?5}S(@;mSYKV5
zN^jS}3RY=Bs5FJ|?F({#Iug@WhwN1OeUT-J7w}p~mPdf>E~s9iX>s9%XCJ$XKmqjL
zwd9T$U?hxZl90*Ygx<IghcnjObCWc}za9MGiZ9UbG-2$~BXf97vmVn4qo9pSLX1)6
zM}r{_O&{ILqi~R!i70c_d5I6DL2*iq=-)_z7l;(Z5*Ne}V)(U4gYZJr$5ZKFEWWqp
z>|A}{J2F1DaDeC=sD*DKx)XXy8q8@$ipGfzf=Q*2Ci}jp+Z*5|WISA75Rek@Zzya}
z3`ie?Zz5`7YYbrcGt5N>+|J8LP-`*A-AU?+i8>a8B8m+SK+yy5S`tr|nd6Le|DC&o
zC!{Rz_9b=dfHUyE)c={BKP(ed^Y*ffg3vZYXsIaq0T;GE0mv@l<8p^=(9IvCarTM%
z3#<eH8Rm*Hzmz7N(M8=-2uJMrb?7Ua$vMP-5I6iu)r%H^8L4PoR@eBN7Zo`IHd-32
zRgNlv)SN>*#jhRMQ8JVB_%wqsM17Oi=7f)d)@DPMRXuJx3_d&5*u;#vO+f|LTK5)(
zY7Gr3`{YiB=*}M6ZHRkCPK^hukb+ZI+rVa#HP)Fe<^JxW*5C4?JuyFeXyFlOdb_+#
z=o=n&z~|o!Bdb*ZpZ2~xp346JpOdY0?1acJdpkCnSt)y0viB;ZGRvM7vO<SsWv>X?
zg%H_AM~H-wRrmLGj_&*WyFcIW-@iY8k6#aeoZjcU&h=i`Yd&AkcWGz`D+D)bJ3QS&
z@buV>9dz6bCY4uy^!Xl~iWrM@CMGa~*|g4?voYkxr~081d@|`E{e`l#@+&F5`>T)b
z8A<Nhi*+%?Ek8~rAbb+}_q6UJ0YEVLjB;U5+IAV)WdDB19yy%L!E=lsW_+88(g8*O
z!5IadHhG8AuEyN-2>*DuW-FR;LHWPC3v<kF=Nc}n&;F4_9Tjyzg6(9=t;9rHG>rb@
z$!T#x36(qjC-E%_aqz28F|7f#4LPLDd{XlspOySt2ACqjceMI3tc(e=jQ()=E)VZR
zrs96&%k`y=)#(d>>J#y7ybUF4dO77biyhj#>4`qq&f9Svc)NH>BQvkx<q{F*W01Ks
zZ=h!6X5{ueYPjlrX*SgRTu*_5skQ!|zPG4bn_H%6v+nTrfPB=V4Zh?c)G}xPR$9y4
z?fhI|TWDO<*Rym>)sVSKfSQbv->7DwHOsqmxrkw5bY@KvV{98k00ke|cv%po25%CJ
z!1?U7<SP2VT0iy7<;_d6<s=Up8Qkr+U2QZLKI`N<C@3o}kRmf})ifd}Oq<2ZCWyKn
zPZ|NC1fU%7vr=M}N{#F(&%0G@wy?Dq?KvTKQ?Il1+p#eAX_RrMfB?!>%)Bhk#Ssx1
zbdLIM6ub?vi@QVORT;w-&7xkSn^Snd%;Rd<O_NT>X^<zJ8SLv9ls9#%G!YNDFq`}O
z0FD|YPw?9YFt~@GRfsYgW#WNxG3BHt``+IJCvW8sN9A|xB}d;0G#+dK*anTf@0TEZ
zJ(i!yjM5#b3nUs5xtJ2KLxrtq$svUjI7jdLGM-Iz=LnX_^pV{cCO&!JE{6({+!v!?
zWZ_;I(?DD3SbSNU;k8FzT5rPJf!$Z8Ja{TtSd5L+m2@vc-QI9g(W4CAEV^i;1eXUI
zpEPYQyesr--2s-YVxFpM)JhPEU95h=w}P^#Q~sjGBKK9o|26T&)%zvWL3JMn!`r!H
zzAbH}3|D9xVVp{5oo<TcdgQ-oXf$dKZx4H&Z?OR*6_7KxtB`*Adnuze3JYR$F}-1b
zGpG038<dHScYWF=lj%vyE2+$<m{9)ve3EM~MpUq|$kz*w-<G5eOdi<8Wtz(>P{T(t
zK8EYcbF(42P;6(b-cic@v`p{)fF)XgoT=&oKUMEjD^|?FRpq(olSH*@@?vcK(szcJ
zcTm%_a0Yw<rb@T&yP{wAYvB4>^dX{dAnlRyIVHY%Dt5P*7$+VFx`&jreXtvgvH#I#
zpV_r0n8|@``LZ6<7nP2LNdm@!9n+hpv>CCq4WnysQ6`rxyaH@A+6Cg$9adJ$o;$LN
zqN%S;G~BQBK1rHk4z-DL2cn|h6xk?iN1=4;RrW<Xy`r%NW@+tB?$LnzQ&S2;7Cwg@
z%Te4h^~HRaef-`~`N@EOEW@paK`-_Z-&fZ}csk#IqPc3X48?O<Gq=xtT+mCh%If*0
zZBi!kY2mi;;f5%Pc5L*yh2&{QPxp-xwEdg3OZDqL!o@SawEiDk*WopUy+U_J?oOVV
zS>P8Z_X@JL%@?`hv><RkycdT}gpp9G>g`ea*eYbc{d1q8c;=jwW3hXzk$;2E%BmSr
z3a89>=dhXc&R?$+HF?HJN}!r0*<}`Os%hn54?*Q?QRqpnF0yzg8PkAA(<~mwumWl5
z)iE8qDX8Ssy1(h^j8RX>c?;Em8E^Xf%Zr>j;`YRS<0kwGfjZxIyO`GBfv$YS`os6b
zt9rGzI*?l>+^}Uni@6{e;zJD*>c--k<oGw(iUB>{ei|luyPImS!sP@qtn@LNi~S}^
zE5{3_58ErFAH#Azj0dWz*_`wXo7|3*O#$<f)8DpX3(gz4#%Sqkb#pcBd6&u-57RNB
zx-9#2A&G6-{7bcy6HXjWyOh@Sz!F_rW&VD+>$ffUb0-1G($s8z#)rBupgO=PqYC7p
zr>T#{9MrjyM%1WfoMEXCdqWK=VVtsszG7#TJlwQlk3IkMQwi<!eHYk=uh<Wy%-KB@
z-&Vm~PZ@znf%S1dwWJ}cx<^le`n<g9`J{N6>DlQ+<!0eYq1nkHag_LLUq;7Nc_Ryv
zVu&q*wd+75(M^e34k#}vsslT*{FS?nluUW+!iMJ$4Ba3U1sa&VSCE=NIk1ON+DuGS
zb7!qXh+Y&2r2a#+*2G&7Odr0c?)^4_7c*B=WlL|rhu=uOn#~PLZil>*YYw%Swi>+h
z_LdaGJtv0y<eExva0i7v8DYHSptFlPtPB$JK6hk4JP`llB8G6}@rt-ur679$RD5Yt
zQ}geU#XfRaef<DG6A?9)wQW(@yj5O5rlgx1@~3)~IM}|gF7wJ6>wZyTp;j<GJvzP@
zoo2Vb#INj8R$A5cX}tnOT{?<5fi_j+X^bTG>NDT9<vEYQVrN1WG5L=Dmj{{hg+tx)
zt;|<s^C!H%zTJy0nFfVav#n#>dfIVcK>E(*++m?yp_>muG^C+zSDdouJoJU@I999M
zY*7{lnOT7jX1Qbpi_(pOKPZ?7divxn3}6#T&$P^^py7Spq+l*{r`e}JQ<)mC^W_UY
z{8grNK&hu%-~>NKf`#jQ(^WT?u-}_1^{yDPzx5AGk~`*<`b<m4w;_27Y!v;NSB+zk
znv6s~Lkjo4dP_s--SW`j%l2dDOhe{{;ZxNk9Gofx#z?)CW3ksPE<h4XJY4U4f#Lp=
z_xi4u96n&<cZ8*@EbYB^5vkT;Y5U0~elbWmP6PQV22Z7B@>cj(xFTzH<A;x-=3b;}
zt8Q^a+0(NA{U@+$=|{kLB8nCi{dtNjjdYvJjdU&-I<&JU-0}<KnAtuexjEU|#UT@G
zO}#qKLsF6zJo5R&<2*giOB=*VBc7Shq|z_-oPq?^apWNZz8T5xv0T#`5o=<NLGa}u
zS&Fj|2cO-)90E9eo_(JaxPf3)lpb&EfVK0nn?A4(3w?OO4p!wUXb3rT)bIxjF@TSJ
zk_J}ZMw|J^pK0Sw22_f1vJa~(_>md{NAe4)7{|1TM{)^Z+J(EHIDckSG`QL#p1~aF
zC)_T~I<k{Uo~Xrp;}P({J9i~$-QiV?%HGI*JSWVuH{T^HqzV6@pCU=nJ;y16{?J__
z5!#`rrymscirs_^AmG7y<R=8fL);aE1rfuJ4^;|)yQDYHh^8|YvnJ(rEX087$&zvy
z*8VM$oqJKb#UUi0|ETAIQ$Bw~XFg90$Ib2}Mv|w=c&hZL|CqzJ7qe6fGW8jg8E^AM
z3--=<Vu%n$9Ew8`L|F6dihui6^JPsNwd)Uh9;A@Ddc~;g2WhUwvr8g1IPs$e#QunG
zhd7BSZ4{R=x8(jM8tdey&pjsyE^A_Yi(T8o&G6T?!zsZdE2%wpFs|8EjccUmq6Ial
zooxvb=eTfB!^$9b{DwY2|8vtk=ICOLp4X!KdD=8YF`D#}Pp%S1mS}?}$8X%?G!>Y;
zPcOSwqf!iI;i5`EhP`0LuSXYQms-ol>FVk}Mtk`D0!++%Vrv&lf@IaB&PYabuRKte
z-=cbfl1=V)&}-;i)9kat_PO#^4*@|L=|6*-CDlG(O_hFG8h!DiY#z+rjk`&tBo@>N
z8cPQx>buh6<IK=;2Qx~Kz+=NjODksF7)=M8LY|hVqxy<h6>SU&cpBf~DiSU$wc13>
zUlB+;SNNdOP&EY|1w+>&Pe8FMlH;BxOA8ruYWfAtLp*;P{(X}MM;&)-_@(te>hv(Y
z^8fxJbsLK_zha{yoS}sWJu0~&Ss_4TVUFvO|JTsn8o6^7+hS)~Es+hdGf<EcnOWW_
zIx)6X-%zx4TTv+d3O#{husRr^2_7RHO!=o+Jw^$hUKzRwmm4heliKBtnRD5_52$+m
zjhrCVd#L8it(ScdBreRDO!fYLUNZLCw6t4q$9OcM?zu}BfmVzy>kA67-evEl`Fh;3
zp;rw4CYWRNwOr#unZ)z$N2+%Piy{RL`UNe7QwXSc!_MZG9B5r_OU)a9C6|ZTFeq0z
zby28Tsu##E;aH5`YjGJ}uk7y@%sRxUa52-xy}^KB+vU;J_tXT%&Vphe#8!S_>n2)h
zNfO=jp#pWo(KqvE=d*3PhihaaT1KLQtWw6)<}C(u)2Pno1>Tv<iJzpTWE<`8+P%&3
zxfabR{YzWp{qUE~-Fg78TV`*wdo7RKdYjw7rLfxIR=K09@{$W&Gi3WMbi!MuU^)cd
zeRb&?sY6#UwW_V#tAJg<ruI5|WO;+PT5b8kMQQY9hdA8?g@(XM*_m$Nor@#<Bey+k
zS9}h()I<-sv(hG;6y7xM8~2xULtg3q1&rrF`j!O&R@veqNGaEe(w25>B5u|ve}C%v
z%K!P8`FM_QLe5LWdt2ox#+{r)3#XeLC+N0s_z3X5lP-Iw(R%O8<bFZl<A5FR^vI;G
zPwNhCb9uLky(s8uv37yu)9Wef@mb?J>E;$*W~HKus$}Sbdd$}Ex0#@8><5f*{nFO{
zZPZki_gp}9@l(j^OYa+ow#y|K;qn0BDF6%P(T=V{)wud40}D0=W<fdQmhXxKx$Sus
zz01oCoK5;+j-Glk(OOSGSBft3{<QxBHi!ft1N#INaDw;H+d`8goj;^b4h$EAwb&|f
zE@yrn2zTn%mBS+&t^En*euaT0m**(=rKez%8|slrv=+{`)he=Pk&ZKEN4OWOSogwf
z(v5MNC^0G%iPb5dX~8?!pzLf*JD|%`ZGA#9!+b?*$INj>WZ?Jn!k4agiYr1@q+h$u
zspr76*s-&%Lf9G4MF)8~ak2B>G?2bpI|<AR;Kke+A>*Rqa*RI%727L(M)xmGCqK;4
zGy4qOT1^K;5cq4eeWcMn;nEup{-nTp`P*d{=%Hd?qU4PJxxSu0SI6&SrY@VNqq*&D
z`dW;dz~GUb6{sOCP-VibllKVJdERGkdqcC7PCwA@Ba6HMUTE0MA0~|%6&8}Jlw^;=
zyrpdbXH%wJQN9??v*~xu0_)Muy?PY0wMPbdSG4VDP3UXY&eP3pwiQOB62K^Nuje|2
z+j($aaC`na>A1yggpx;$bi!nS*CSWsgra!rRPL2u-+)tO+PE!1+Ai<9UnFV76m8h*
zVR8r#Pk!FTjURQ}&GB?uk6yMsuS-jiIjaLz4xA%Hi&}A1pavg_=~Obj>co{l0*Lce
z&34>zyf0I8`;#NL;W(7;Xsz+B;iw|uySV&N@Vr1#sXJQ?lfYp1uk=;5ZmvRa3c5s9
zunEB$cA;2&Q~ZY48Im}>o78A~nI`%JX0Xf@{mw8jA=v?W>>m`;%wP)CF}Boh<##`Z
zfV)nZy-yk$v`5ajt&T7!!2e1j)qJOPNTOJ{&(1el^G2Ua_JmjLw$Gl%b+-e}%_>*B
zVA2Q~nhUGLf~tfl*G^-CmCaoW+r<jI`GUo0S($Ph1L-yaW(H>NuvX2}H9MQnegnJ8
z{FomX4fxsiK=F~o#|-8$KgNXuHe-Z35Ek^Rs2W-V3CckAWaet(Zr*(IK(^|^KB5hL
zA46+bA7I_+>Em<;eifhXohnqduOB`CG=qn#?&%*@-Y1h@!DYQC)Un|B0?EfeR|l(6
z7OPy~Ooa8Ao-WG4MJn(@IFM-u4pi=D*4LlDag}L%KDk<HE>{-D%=s<44NV7pGKo`u
z^U)bAOKV`+y`7Ep81YukPWMja-Mrds;&$h(Is0Pt;rsL-d4yP3x|@E9@svEc;<n2Z
zY4V$EEg_+tYzo#^{j?38U&pHyM(WoO8E<+>Q8tXd7{5x%z8PQcGzG;*(_^L<UeX0>
zx{t|9{J0MmO_vy=DEBIReTYV2LH!Cd2z@c*q>-8EBv=($(zATq_fMD;g!r&OkuW)7
zM|^JdA#qcG>9XUdXqazF&q?1$W^w|l3X`uGuh4Tl4p4N)TgNCtBpNowaf*blx7-R1
zYzY#jG3Mj8p<JHJ6Cy^K3+$IGdss5TaTXHdxOyd73^pD(w$tgbE8wKNx>Ol&$C3Tt
zZ!Ioo=vlx)Xb6cxkWz*<CxqDq<$ViUHoTg1bb^46D3F}$Tc?RYI&c|Hy9L;7Y&W+k
zuESPS84%I_MS!zyNH{`T+dtbo)m~=&z9G8VSV!9!M@<?!Uu0#3v5qFg!NF7WLU;9^
z2<NW1CUSs7LQf*l<XG0jU4Q`&6$lA{Cz%xgeei9TSt5g`cm1<lMrg}Z_r|n|ng~O^
zVFLlR>%s<5&;DfYiSn<A6-4q$1+BW6qSf5tq?Tb%25K_%Zv#L8Q_5^fS9~G&J=1x<
zYw9_i0-MkV(xG?;CB`;WrQmIZj-&M7GHr@ooeepg(peyy7O+p;yY%uN7gWg?!dex1
zls_n`ENFRFui`B9xOC?p7v+9Ws+|9232HVyWfHB4q4s-NG5YMp7;j6j_&LGwiu$ZT
z)d-8!a<OLXppJRJt`Its0opUDv9XEf%ODRR=OiL+1=LWpv8&1h*DVDS6nG`A(L+^w
zp-DsELq1J6I4Kik<Ka;VYN{w1R6Z}6J_KC@UQ(OuBYR2v+R4Gp@*Z$R60^GH4-3W^
zbhG7~zANeuv>`x1PBmoL_s9h9!!_eKS-HQzn+s{9tVFr7pe9n;yLCah4Gi!~SzGBm
z9^*G6_TMbcYm6`9WMH;_{B~i(@B>+;*N@)M_OzqE&yuRT#jxci5HVv-6hnB`=%Xd8
zwl~uv>5fMnbAA@-?Y)xrYcjI-p~@`U!spb@8AF}!$RNWNtKtrekJiN^Hcdqk&*kP1
zM{mBkYL2ZEeNVK~5!=<e=Wx+uF8`cnQA}0fx03@|OHISPW(VgFo!z=1Gci<Q_eJFu
z!|O-ur*#!EACSMrC&d12s`TKnpe`RZ>R*|Aj}03y+s6JiM3Yov|BXebVloq$mtdAO
zs(xJL)U4YBxEwQO%bR0K+^F!4+pUQTB$rfJ2{$5s#X8!(sleRndOsL0;!R<<c;@S0
z&lA$=HjyZP_4B3++HDb!O7iZon=+hbfpo%q<P$I}iAZV@03XKL_d~bv0B{Gb>y%bW
zfN;KtGw5|qk$U|Mv87K%@xd&XzOFK%$SVJjJ=;eDR_;9UMLxn1pCgO=3SGy@-b-x#
zad6&l@@dv_GLm&AoS6*evB4q;nofH8cHT__YzMGj#X0i%B2-w#L%1EB0Q*=TMv@Nh
zzioF#2JAx*6{FCsi33L_YhUOs>71WD=1$lrUp_`)jt^`(I8z=p+9fr3XuJ9?)oj+p
z(?>S-4JBD|d!gE7xIJ2MIhOLqv5?(=j9(shcxV(0PX!IZaQZ*PUHoz8hSrG?gRWHJ
z^q@;3T|e*C_-&sPCj#(KV}s`2>I)wcUHm?00ylS7T?8K@q?kRikNn%$i#=t8SUU_l
z{d+#<qP+{U#t5*1B<9_Pc8|n)8D)~CDAh53h21XR9TRjaXNW22N+a;@(odRQtXXZu
zLR^C;@*~q3qw(}MI_I_kUa%ZXXU*#1TYucnTsrbFTAhyV6&o8zy);7B(!tm`ZP!`p
z2(lYoxO+`#>g!{(YxA9H+Ov{UlYme(<aTjd6Edae?UpDkuBj?-RN_F^oD&b=qH$YZ
zHGG~=eTU0He@g0H4yP$#tBmf)^pAZzQGN5|d$JBdyaAdjouyE1z}3ZgIaa5i+0a)R
z@CBoD94ml=u-%-P?D_&wA8jr;TX3OjyM63yK+?0CH*;Hm1GNju`8N<8*;LpZE%zOV
zB(G+wPv=^85p6_QSJs!pyKgqU@z>@mFL4$zT?K@Ja4E9xCd7kTrMQtby|hkTG!mw(
zquXr7uf?T>#igje>3mq~kDK4AUAx0YQ!6tgju8H-2qwnq=H9{ZOgHx%medt4Y;1Kh
za^ff0E>krqTYh{$Z?6nij1~J5OEQo!fWfFV{(L9{KnjLg?;zS4>P^=qip=~>hX*`T
ztoYdi)dY1d&u#zUPCp5@K+?3OZRl&j(7UDj?!m-@?SX71sw<fS?!I9>=iQ6*0oq6O
zxFuJA8B;xL3y=+w-1S7;AV3tdBM;IOEppe^Sgu~_(gx-+A2o7nGH$lbjiU6HHhqcv
zL|2PvZ8_QM#_63GhkKU+n*ct>)VQp?_-As0iLErekWs2n-<8c>HOtb@U}w;JBa*GH
zk-PO~kZ0gAxnJ!3S<NxGG347W5Mc2?1n?rD<(!Vs@0@v5Wose9!|+2ix33WnQm2fZ
zWD?K|{28p~zQ)J|V8x1an#>T`R+%2-bo*Xb664p)&9iy7x8ByG%Z_tx>FlwYH_ikC
z5SGj_dIo4AV8C16**OtE#wS*ruQs6ky5l*?`t97(=RzCQxCqE4H4r_WIA{!_4dq5J
z&g+%mX*Ia4^PQQAIaT$3eVyNqPX9ANtwBJW%a|fTS1i)(o35lJQIuH|R{^a`S^J}b
z4pn)sl%K~n#!J3tn8lKx_`M*G@H07Bf}=EFJka^<<4-v6_i(Id0P>t1x~zosX7dqI
zIk=tGt&nNbNJ+9zV_iZ~ICSmj?y2&eq*%SEsm5z+rBzt44*)|($PHanM)gNn*=ir@
z>Pm{=jEz-gIjEJ9gZ+fgUxa6Bq||qQ0hp+^1-zb3jaDice&c6=Jj1v>d8r0q>x<Ge
zj5svmihbndFYRYQiD`s}<M;@7gqI0!HNH!>(&eJLhppZR;DO$Ts(YnG^c1gC2Cxh=
z%Z1@E*ih(Qi?ZxzZnouVGOIIL5KnUj6xO^Wyp6~j&y{x^cK}Vf-EQP*{<HN(b<<rA
zPpnO5X755%XFN)7E_2{A(3!M;qTL9#HZSr6S;ufM+`F@j0O%%w)c_f`EV9V%!#3!v
z7hFglchU_9nNLW;^B;yF#crkFiy@h)a_Qv{*^FLzAszq3HE<F$`PMs&$0Eo~0HXvD
zt50<Irxvs^AwMPjEfE*Lm#7X@V--dTVbSa5qUQ-M5nXfbjF!lPGJKpaI6R%n;vD5>
zLXVZQQYNg5g8x<#ODE=eKs;k3E`V3Juw(_f+t!A_5BPUSl--f)y+_&tcKzqC2oV3|
z6m%r-wEnF@0>Y#q`HPF0+)!dNrpM|6j~9x;R!40(<T&*99b<<+tjQlWQ{SivGZ7j_
zh}b#`XdSJW>PEyYYkYFp--GQQCB!NzCO+hoR9qW2!+Qfet}P<Z2TXr!A*nI}?1X%>
z3Bharw?4_0jLZiufDLwmGW7I?GqpAdiOC6;IRB;A$eWdzIEiu7lv&9dUrhdgVd|=+
zV{%A`7+GLeH0}a|#KFj3^DnPtDrM}dnCbFi;rM1LkPBK*Rq!Ih$CeeDrG(K@{1J-q
zU+Jo>N60X*P`o$-AFig6&xL4ELD=F9YH|oQ1&?8bCM;gMA=1NI?{V7HHZ3(8AH5d|
z!QCtd4*h=+7N72&s2}%4YOvwE2Uj$`gaZeRoOkraj}x*L3Py5obf_8`P`_xp9T0P&
zNE8ZwfL;pAFoGT^sgr_`v+q@WEY}7e6o*0N=^8|yI*Ah+r_^Xp;aad5A}nbK_S=7=
zqRDs6?;k(9<r!3Q;KC&kF=15{h8X^QnEH$>JK1prwk6T$D3^-;jEA5BK+!q*Tbi#*
zFWft7c@J5A*g1Ik*uOvF+x)d!N<x?~Xwd+w5Nzh@=z?g7{?f3#eEM<h?1iJ2uYclA
zcqGC9#C7#=PDLUB5CJV5f5&v;b1+^fj3oM`)v)_Y;#uss!Fay^%h;YZxvGR5jSY*6
z`p>BmT(0v42p4X$r2m((y`PcDYdsp9CNNX{=hUQbO-Tb7277@b^Zy*1>{mBNeH+}^
zLY^L>ul_YPTKlfdA_Pa175^t*{eO(@jBi~Wj}UHbio-{^3*5K=guH*%QUG8Kd%-33
n|2ek*zwEe8{{Qx}fBK79I5*Cwarm__9{kf()m14|wh8?&2_;^n

literal 0
HcmV?d00001

diff --git a/docs/assets/ci_test_dependency_gpu.png b/docs/assets/ci_test_dependency_gpu.png
new file mode 100644
index 0000000000000000000000000000000000000000..68cd77ec90c689ce3e65737b270c4f149af54bd5
GIT binary patch
literal 50628
zcmZ5|1z1#F+wOpb($Wno0>V(zU6LZ*E#S~KbeDoiNq0$icXtfk9m3Ec-Ep?>`+whm
z&hgrpd#;(iS3K)k&$HHj--}>Hc_~aZQZx_<gefEaK^X)>Oay@tnx7*A-;jQIM*-Xr
z9F(QRK&2z(yC4uXNalm6sw=`li>V{N)WX$p-$Q5ocEIX}pio_U-yFD?n5=oukTCh;
zu=}unpR;d&QU4EL2^%e$6*-8hgBboi3gp+HK=T!8kat@tVcZNll9|_(M`yO@^a-;B
z0^t~ha7?8LDLVoGrhRhD3})3bx5q-sY&8*zLFynzUxIeDA${bnp_N}7QO3ySi2w7w
z2)oS?^dPPs&G6sv+7Hx0Ri4g1KG%Nu|Gf0~`viCb{J-TSETb95jUN5M77jD|Z#h-u
zRpkGDyh;aQ=yd+!VLeajzjYhN8OQy%T0>_>UoD}gHsO1Y9|XiI2rcu>lf)8e%RhBP
zxPdnef1rKG=S%F(=U(h<zVL=2{ZqeRf|_qe%KI>-0?v7I4w)wJk1+qTZv@}fkfj&C
zIE=}r%H*=)#<0s425V7PISa^of37EXQ9=NUVP8BlLO9N;Kx8ZSoiT|v4^>hY9fA%)
zi$$y$GPoCG`BY+vNp&%bMdTPVrV@X3&xdPKCOZqz+LGxOym&`FNnC=&nW<uD^tWkZ
zXqpEmF&x8eFg92PVnu<kbY_+rL@e+HT`yol+H8;v+v{&OVtGS~Mnk7_)f?>R4juC&
zuaN^c7qnEhbybVo!D`g9HXQ${ALlJCB#zWf*Q&Ukhv0h@8D^CL*Wopf&o>?|3`XEb
z+CYL{wJx-=Q2zh-C6r_jp!vWP+@ugUtA&%fRyVuD-Yk)rgoG1(g1nBugZ*n$vJ2b?
z?nCR&MN~2}f|P~zzU@N`mGYx+7-xkUPliWq(|w=puKoL?k&cNjf3Zl+lG(q;o>Qfk
zmBXmL(Da*A2ijXbP?Z90JPsuI?uHC0Bl&@g$L@jL{om4Pd!eQ4wB|%B<%n!$pet_S
zG%|koi4VcS!IYwQGUbTunSV>(_`+LhmOYs>lZcG_R)P9(>*K!#x*`^c{8HT?+O(FM
zl1m6k>}Qvi8<JDS+zuxAPWiVNWJIucbXNX4huI>(VhS*6;O94<AfIx?D_=yR^oZ<~
z_cfx_O4^4sGF_!PI~8wU%Hjfs6MJ#;s#pYr9x&0u99W8)&RmjOoph+yquqUj!3epN
z-jH<>$>^;NdlmBbUjwvjZZH@kzy9ZWVz8-b=~R*lgE%pIP!>>8DB}BA(E`q4!W1c>
zQ-(<YTePd@2u?O$tu+0%pmDXmL<Qn!eCl_uZ*a1svlDl&s{cNYU>sKDOYlA8uMHwK
zvB==^@d3~U3Va3X9<R;M#r)xo@%YC#o&tVTM;YWXc$nWp?1EM=#DPW~HDc9+H2p~a
zx=b~jraZ46(r)#_fKBVB{`-acQcb^OG{>{I>ijZ4<vFU1C9S*h*&~3V>Cs-BtUy#M
z{ac}Nc}g~ke|Cv8D#XooGTitN8}dEa1&4;>!RC^ljZMksyZuQyq6#qFHq{kcHZF>U
zWRM%dK(Owz68_HCM=)2=vgqeHg>l5+K#>)hBGpyoVTMwva0sK{B0Ou(ncU0v-Pmw^
zLdk~=m_Xxp(%%kadW2c}5fFd+>u8eJ@v9wBpv)Etu<qhGzAw1$b6nrnqM0Q73=btE
zj;Nnj7nDpisn~gmnQFLx<Jv>k&!)B8q-OCsF4mX6Lnw+k1*2UV5HP$IUV${YB=Ot9
z5o019LBObF0FjB!Xvv=Q=p8aE?2jm5=TN}TAfN-^K)Un9N}wtMf2FSf|ItR&?gw#b
zu(D|-&iK56O$EP+#WREK9YSfd-ovkY3uzu>oxd53k-f=~jnd&1gDbXWRQHP~EQAmD
zgTK-ZZh()@gpW%Fdu@4g!lhQB@7B(4QbIr4R$_hMRL&7vQ2!ENaQ+~cC~I2Dn=dl2
zYvbOlKf%CO7~*azUq_g&mM&K$<knb|QD3HQF?G+yg>Qh&h=YmN{T{ZgfAu&Kih5am
z{cvRJU;DIB^?UoY%cjhOR!%?^N1@DB{&gfG==UY%wb4UwxrW=tMV-QCI+n){v!o>b
zv8`6241QK?=I*!nCsYDx_2i-R?+!O~q=3%VN@jL<ru5OgG171Eoj5e{R8cbJ-l&A#
zx~|)1C@lUoo*0NZA;XS^lQUqi{DwI&APr|n2UC9eXX@sO3{@`iA9x9FZ%}_lJ#3A<
z3wD_!o1d~-XnX*Qr%b6^TStx%K07!wuwE!sWO$SQ`@-GlmN$+1yG;50(l>)K_|~wG
z)lItkpYTPgamP8Bd9`-T`<*{5@ugos36J!T)Ay(8JD$y?r}@|$`aT`|_vtJy%wJEP
z7ju@i9iD_|(j}*nxw;LL#WNegdqcG_Y>e>L8Ps+YgOpsWC#8&Ud_}GYH2CL+*QpUe
zC|Ng(gX=7HUQ{luTHx3qeKu&QWRb!EU!%>w%cA$3hPGA+?wx1rS$vu8T!rP?Nd1qb
zz<%D)r+hKXp9SaarnJ53RdPA;3{H`g3PdCC@)+B0Wa)bv*lvb3Uh^|S!_&Si$YL&_
zg~*!qrp9{V_t52iN#k=-VnyOdH|y0~9CO@TlCYtmh+!9cr<RN1{p-D!mFT+$Qwc1h
z_!!I|*^!e-6r5UCF9vq`0)oj#_bKv=3$}aZ*aU+8D>Q}#y`xgtt}r}e#}TmxwN{su
zv}W;Re4&jD7Npt>9{#`kmN*kiIKxUbY-8|a)i1DrD7xoJ`rO<yIaj@$O1P2x=%4V#
z%~Qe|r=jbJR((sdHX*sz=E>+mGDrDs<lx>O+3NlN!tAO$-RZlftSnJv5KgV0_faRy
zsK8ybaserBa47*kvrfg{;ver#NPnQ9ewj3l<f8Z)Rns->3%=wij#Xta%4>Y$FBd1K
z1~hrO&p^NNRP^01oFe+3fqqJViD1^ASmaJjL)W)m%?gS8iQmIyv9$7Rf5!HAuN=LA
zHj|{Wn}d$&QE*jPo$!wK-NvL%oEj!JcAe9u)iOyRYw3&F%;L)0oR$##hHActEn8<8
z{_)H5)&lE4qXk%~1>D4SUUeEdGxGEbX-hXJpNAjFB(7}ny*xF$qET0#_RnrmKA?r9
zrVwVa?#h9krJjDKe97yP|ADWaCNhX8ErE7eO;iRg{}ysM-l-PA^wJ`mJ%NL-MKW(x
z*dwz(8{=CTgwe=N+@Wa1Fltcko5Wk0f<)~Wdoi%>{!SV>OeSeyWH^}~MwX;An)38E
zzj78WBrg&k%}C4;C20#dMvD%(Zti<!;cf3ssbbvKi$K|;J=%^_giKAa8g9&_Sq0$=
zc$RdCV}tNx?57gFogAry7M$~(p>7+i%9Om9U!6*Ve&72EGTu6gzdYE@Em^Fyd5Zg_
zX!p>2|4kud*)+}Taj*vQa*W?{b(AmV8E6eL0G%LtxYfHW4ml)E_x60yxKE+=X0|3%
z=+c46+wHP}Xih*Oowv?pzKXEOX@!cAv#z@GWYzm>L-_H6r0H(#z7Oz<dA%_8yFV=4
zU{BB8Uv%--OQ$vSMxsIbUJr-4TXBE9m#PfMqYHMmK)$t?;Vv_+PeFZYx|Ij>p~Y|d
zBg=ux>swF>S-jj&Ci1wgRwSd!BYz0dMw<iG4G^G#d|~k0Kk)`IffP)T>mMIsK3Dp6
z=fdEJRf;;>{n=HdRJ^*Y_Q2uA%X`x&V}*PB`5$ViSiM<6D4=$;%J`T#;hQOF47P@|
zkil_$Tgz2{R)|L1wS~kNCrv63tuKc?jNg{t&%Nj)(rGm{C?`vI&(3Nh0{u)#(^Hx}
z?()I%YB;=G!ZQu(p3u9)ZC!XYhKvM17Ygn^KiI032Z=1KZ-g;X?1;sq{qSyZe+E+2
zSGyW6eU|Kea5LJ0A22?|gu*=(&HrbW=e)^D%(=;D11H3xWq?724ALnXDy)zwTQ-4A
z`i+o(08hH<2G@DxMfMw>bm)eh%|swDoUh!q!aD^5i+mwV+syh{M*8ZP<0D7<s$PS;
zg91^D3oD%7adkB=6~hlxp<VRp;mrCY<IUYMq4!lpn}s6$ttDAx5|A%cOl5_y%(tYE
z>@SycXPhT?8Qv8Q@uWm2L`8bsn_!m-AH+lNH+RFCDX+$AI*qu}KWdathVrvKVU$Jq
ztWVTrPYtCQ)6Jn-T=ZUNs|F1C%hRQn0-p+Wdxn{+4LLSw&2x(@`2-p1c$v;DL?h=V
z+xvea!qoOQtL180(JpKSlR9L7zR!^?7)PyGGO37FpE>!48wa1pWb(|+&i-|&tz@pE
zQ>SeTF34x1XD+h!H$c+Bv8iv&l%P=?3-u78Kf<{2NBR2ucjexG4;*+85|Zli+wq%m
zG~B>ok?Myv7j4e1X1aR+(GmDQ(UL8oQ#VehcxH6gtH!&FFb2YrAKH7$wCLP?J!skD
z^?)1fI5~5O&Eo1Q7iUoKY7RIa0w~?H_Na;KDq|lD8yj%Ox3lphUm8ZYJvI*8yl!ve
z{Y77;vs@pir?+0r)^v4s_2V7y?b`x5X?D45(yT@~(X-<h-PMsW4{N)vjrrlasCv(#
z;4$EB*p#x0^s$1?E2illuh@8Fr#E*_@cy)3wH?@aNNodQjwUC&#QQ`o@bldD3910)
zXq0fNnVA(D-&>nMer`URF-d7T>Xu15+VN$&VU$9&gV2cCCX0)Y``k8>1Mp?RX6~m`
z8_hVP74TxY@oXSPQ)>vnrmFG_Q2D}vfi>01_sz}JcLn*P;8hp;*k5L-GKbgpK1&KR
z&Yc~srS<7i+6zto7JkmlHdkAz_S&hGpkuwIQDwJ`h}?=CLH2DV86^YjyKzxvp>&0U
zi40o?D<LYXnR0y7XTU9^TfTRi@oVlTvyEqD=hkyqd<itBkM>;D%4W}c95N#hqgf6y
z^__^VR*yK#NGwzmoUHtRx9+ih2_Awb*;9@1&MeKadB;tLHnmoKTV&q8K%c0Bel;o>
zFe)l8P#7Joc$qDkryw^#9MwO>CB~rf;;WJBRCf$Ut=`JaiW3KmK@oQXn9EPGSamfn
z^2`bFxa=tmvX=+Rygs`OS0ry)p5(N*A}RITRt#+{|Hw+qZ``GJZG3h!>oDu}Dyn|{
zG~`yJUdEj;YS8HZIH8tgAH-CyTG|+L2vxnNj#H7>_@Y;(oJI7*L^m#8gL{r5CM@HJ
zY|&uhUWs#kyOEY!i)JCloj6o+q*I4L65Vn87zL$l72M`sbq{fKVlj|rw`mGspl7Ih
zX-A&FVv?BI>uyzff0&hCk&lIDG6V10Qu;(m*2ZP4r#Tt?n%jC=$oA0~NP?$8fb_Im
z@pszi9(Qc7;C&Srjeh~eS_Xz9rCO{Y<Ht_A*X2uaXpM`b&{HRj-)9aJL=R@ruRPEy
zYvXQg$hnA=B69HB)s5rk;1X<Jvk#g6!EC}xyglqwoKu5Yao^?RJ*a+Z==nQob`|2H
zL=AWv;GKTf5wtWp-sdOVR!JHLK6m#$9}kYoO3qS+54?faEHuhEKddmk0}}k`0emiH
z<C4<#Xq2U@=_YSfsO2|rHGk5<=S8^MG$$~#mLOGG4wn0SU6o547r0os;cryTTj!lJ
z3TnEx&%B6S$}5mjt^>r$oDT}~OHCKy^mPXtdA3UoQtJ-!XmhMTK0l>3xLyqPw43=J
z3Ll6oqJkVR&mU4!I0XdmFRna7J~DRQ9ybgwwJCMKPQe6jUzs=0Zustcu&rOVR@&}F
zJC6II|LQ%8<YN0cnTUz;U5lS!ie73K{(^P}fqcg1SO0xn5x6h_(aA{IUgFT|KC#yS
zxmKq5OU>}H<b<fqD}JTERRhZIQ<ZJz8EzEvN1Ea6Y}1ol<o7vzuik}OXH~&BE1NSL
zIIpf{syLNBO0$bAGmdFJ_{g4r;U&fN#no+uJ`C27aclsI{}79T?QMW{D=cOYh?-T?
zl(p?j{m@p$r4?j77E$mv`ZMD<q&iKm&jO^2!qbGfh6Y0r50_gG>-2cCC%;?>EA57E
z{jRi#R2Ajb<CbN(LlVatBys%Banwr-qvF5D4cTL1_O1i(eyUDZk46Rc7xMQ|J^*`w
zxNcMw;i}$vvbk!fySBC^>ezuEBN=hy$yMR&?S(d<9p^5D<1Ep)R!zFB^!FD^B4ym=
ziDlj=)8$|vV8Ep)_3qc2kpYLSJiF5~J0R;~jSxKPTj2X1y=;Fc^l;_QMFj%I!-4TH
z1p?U7xUk^U?*SE499We@L`4wr!0p(`HIUrvk4HuXFGm)x#@{`Ab}uY&Niz#SI&pv6
z4r>!QnPNxXz?-+C!@oXX-N%cPTzEKW5bZIuwgj4hn1?4M5k~8w8JFHKJgg>O#{KmR
zuZJE;L`kOSL}eQR1kKqqRNe}wx=QEqViV*qtH~m(XfCNj%^4N6uWe*rQ>u5(DbL7{
zNybJaidU!2ru@wR{PI4g+dSTd#<X*t6cTcAU2vw;x~!tAJn+p$(Z;ySQ)KA<qs#u+
zh(E*&?z)~?q5@U-Tdw0xKH_nC>ZyiGUO7cHRhJwKB_tMEBrd7>0fO%N0(4olK5|(k
zaqt&1=D>9#Q_v@$lGHs{M;(_3A(xjKl^q?D)bUhiu-MU<<)q0q0o+6Hz-8jsFX-P*
zb9||;g3M^6n)XYMYRtE^=Cp#0LDqc^jAVnSfm`ZaqMy8s^&b<TgEjIwUJRI<i3i-(
zBdg?d0@au&N-2g-ypQOn%VC&aC`S(G0utW`^Dh*?EpcFPD@C@&RCKP^6xR@{qi0tq
zsn+-ysfX(U2b#WMqM?BFeqb^9o^(e@Ha@YU;#%}8p2Z~Hr6EyHkUSae!?4a7H}k$q
z*>aXe!J8l4@6&ddn1`#6{yu3*uQY-p+?tDc9%6>0s_(O(`uqXz*|8RIMo1h+%XJn%
z+x#*P{&&MkgKNp5ecs)k`*fT9r`G3A5M+2WI6aFVG6%$0Dt_;%k1U|}!R@EJgPV}d
ztV5xPdsyn~pWj<}e8*|9(|RvdF6-ryPd+?%J7l&`C99*ZLI&-E9BJkAMsBVh8@L5v
z4tpG!j`k~9B$*G#d{?EwpUrp9zeqvtMm6CBr~5rQR_vq%LmMZa4-fiP$c=oslU0jN
zuJqn=aT-$`<kHeEP<N`z@Gb5dlv@}J5q^3T$NQB|tBS{R?$)Y27FA4q_S}e;YBk)c
z%*Q6D&ZjE<dS)rL-hh?__7=?Vn;TaZ3Kgu0oOzns`^B^Y%kK3@awYkK_?tn44(~1H
z!`vFm!DW_KGdq}s$pIej;tu=3>}X(9=MN_C4a2yo$jdh6taBokBXq7>J&IyY0255t
z445av)i~+#nBOaf$`!P#Yo;Vbje9iZ^y`;K2)puZYide=t!zH3wFQ=hv78q-rXxX>
zOGg=sJ`DlTUnjn#sg$21bXXhExVm%D2z1^A40I|p<+qRbLRZ??rpexN+o&zot0^g}
zI$uc2xH&x=t}i0fra5g_3}n+9A(91liZ7xdZRj=`c?RvZv*x|l6}WW_7CApiNEmUF
z!B2g=Okmhk%x(BgX&paBCKEJfE8Vm<3?k@r$c<^#pq}ReOxmfoyH41X!`2eAV`<;M
z*Y5(;*X1cR15a1|VpHXk&-w_Ny&1r;2IJLwWJ@ax@6&?6s~BJdf{~QpYaJhSQBhYT
z?u`_dvg)$7J;rvkq+|>N#lb1dTy`fJ8(JM3#43(Ya+>w~(^~GZTiu#8(sZ`^*Ei@+
zpKgy<yK-MZFSb(!dT!^Wq6NV)Muwto7SGOg-&G+{MoShu7{=e^QKFtax*g4^YT(o6
zVkeCO649H*|FpfCD#~>6qN2I9fb%Girf9mCl{wJ9VcD+TM~cXHHB%K;W*0NJtZkK_
zCW?<lGSlB{MZb*u-5b+tc^8n#W`!+EG-j9=nJRw>|G}TPum<b%)e^j;e(cGC!5V%Q
zY}gb^4Hg#s8-($6jynM&3v1UiV=>Z7)AJcItkag6QzM^xEoIK^j31%ZR1P)`3ba?C
z!ohf(_XUa|Oic_dU9b;}N^LD9vOnZyWu7h-`*#MsHU|O<qPVgJ?V`)br{0NpGPfIx
z&k{})+<}+|^^PUYjd9DvFky#>UMA<Oco;G=cZTVTNktD`j>U!FdNhLcD(@v`Y7bLF
z7&V^Cd)Jm{R)Luawe9bb4_MNbgdKLK1B~h);8=s^HPL2d#Q2=mj;~j*ueF%*VX(qI
zCaz9V-UnOo*G~??y4DphV<vOhlWb#D3#(E5vz;Ve4SsT{OwRFX=Xoe9(KV6*zSD5e
z3Oil><BhT|&|fn`E1mx}U{vbP3c!J2Wn7lk+F_E5^V1LO!oqrwdDCWy(_1C{Gmj7A
zKXR9CT%Z=WKydkBcvBYE8K%?f4M10SsMpYMzwh+yBky41u#K~uxH8=bU>91vN=)q8
zm%Co+>FRnt+}3=zyKV5}gVFlW9jd3>P^T!=3SAx=cOZHM;YJ00_;o}{`TN`q%u<x^
zxi!Dt@TThVit1_MEd3!LgGNROfTj9Q<q<)$vM`O}+bY={6p+<o#o6R&+gI^{;#wf#
z8I94L&aT`u-m!MiAR`0dnk!KB3z-6j*v6ci!(D>W3gcDDzn&;aISK?ikFyyH=r<z(
zsJg;_1o8#9O{~wu!wP^nwORTglG<VQ(O;u>155O$9RJrdGD45eH@UV~<#;o~Hx^Px
zeB{S}&Tg1>-`!i>MI^iQW_CEzNF0VJEZz?vSqmTVI~~2(Sf~DkGa{89P0PLb$4p4<
zSKgB7IX9N<MI6aN1^ZK=!vp}&fnnIO+tevR$Tyj%DgBcZKgY?1VKRA)z>Y7){)j4Y
zW_dC`*6SQ1<CcE{zc`7Wd{1D3NlG=cPeenL$6e=Sb|IOgy*RaCQ^U`}Cs^jvZdk2p
z6}}LdTYQj3pPuD*(^EG%#MNo!@cip<(a0&Q8Zx;!U{8s95t3~EK@8d`N#yP4mz)Eq
zQ%L1Z0MmwGh8Y9-Z*Tnqcn;XZ0&cxOCzZAHE)Qlw$D-T#V*#ES`<oCW%nJr=di#j>
zFd+rH(-Si?>__Fuk`d^s%PkPH4n7*Ov-ELdW>^Ce+GRyVnV$CpgqJ_!{`5j;8q<yy
z$=2{#QU+f0Yhq%M@2hoFKw-W)xLHeIReO470DWI*oe%<DpEGHgD7<NV((0Mu6Ej>{
z*;cEScJQ39F#>Q(!vQJ{YRT7cipDLTPRd3B#NxI=xQ!e@A2gC{B_X_`I6Ae5u0XUK
zOQ|m7vQAZOA_pB$j-FtT=qM_a65IAHxHH2(KNZb4dvC<`rCo1y_QjX#v<(=t{)!O=
zqk^VpIGZ15_#*@wT?Z;nUj_rvB+))Ml)r~Xul4r)$9BEf_BV+hbSi^FbxWJr7(rmk
z-F<5C>b6YbDJ*3Xrcv4oplr26{-Wyu+B0`(Cy_d)0H9~QQW3x|m57hd`fq|H2T|h&
z&ptkMNf9w=J^U`h4EyAx`v@ekD@>WZDN?EApXy7WpFDo}TW5`~(ptP+@w*=&2_GMc
zD(KU%foRYC7+(axfH&x-L=egs^5oVb=W2(kAFyp_Bd40xF4*G7Qw|8G0l0z#M_Fy=
zy92AYQIJwX{G4plFO&lkypC7r6#VuKuXqIo8Mh~5_nlF5#>q}cQxxQ3a`G?%ucOyZ
zYoQ|aBfRdf+1_U5lvG6sCjbbAk*uUU<Z^mnLNs7adx5W^<SFCV9_H3P`!*1_O7&U9
zz-6`;Eq1#Wz=Hrl!u15^cl;s!dX(RdkLj$x0xxfY-=nfJqB3Bi-VKT#xR3-SPOW(<
zyr@a+ZE|rT-*GhE*}0Uo*yyOV9`)#Cb}R)6zc~|L5Oo!^Fk<JtxD@GHM)~~dyLU`J
z2f@OPx_CO(Cbd&o@ea6X?G;y~T?v-;y?PwwJWtDf>pkvsACfaa(W`t^YaAz~lJEWW
zX)<TO+XcgpEDVl`%kz%hR@$%mo;`mz6gij;*(eUKzk00GkX}B7Mdo_9W2#S;;wq>k
z1h|q&*DB<rpFOXsG{U(JLBIg8tL|3~9B&_}QO~n1ep(HT%z<F(1$;~x9-~zuyA9g3
zNWBSnDU!Q_OXuyY!Za=GYjrUfjnXoINcq$zz{j5`K9zoDgPj`8!tFekQDxb?m_b(t
z9_r*^SW+hNUenE`dy9HeNp|%;c7z~r@cQpbeae(|J&g<xktJ@Lc=~F|`j{k30dO8k
zU6^CZ!kf=zJ2emNJoU>5KHIy#(ksL9W1*JX_j{RK%%@FigEyb9ml;HVeU3wYfy>@h
zF$6WS8|PP89oD&;>xlo$hBUeDLzej!h9PHfBAC71dmNT(R*v}O?GTQBAuf9c4_MT_
zwBs#2sUlA-5uXhOkDyMo^WbCY9dWa=9q=ot6v?l-q)7(iLEKEdAb-YJ*CfdzcpM}@
ze&E%kIfN?8|7C?P0j`UgRMFEM;3nx=3F4j_8+%r9pKUCr-^yFjb{5;SEj;i9Ujs~5
zG(E2nG9zmU{=Y1$EU{k*8$W)+YuJmyi}o#%)f-c(P8K#nJ#{d^E;_2AZU+#2m;aX|
zO{aL?krRrGoZAnr&mHs)xTt=zvq6cE493O7OT8qDM`L8wL;Elg7N|w}HnhDbfdWk~
z70$6C&qTV*linbqpVi$h7vIx;V~G4q<zI5u6jjXbUDgLb^Uu7)_48i<s9@<Nch9^!
zHxj@8KUx5FfZ)rmkmO-37Qtuum&lD_MI<tIrf1ayvmBMbQMh~3VmXLK)fW_Mab^Ho
zeXfqS4N$OL|6i*^;BCyQ!=>}Bcb%c<{}6Coun<@PLBjt~z!A8TYeqv@$hnblt4Yrj
z?$-fMw|U6`QJ1LS6ztH2_8}Qj37Qoa4+j{??XzJlhmeoK1PLU_o<dX#)T!s%Y?)l&
zy+rpSb^J8+19KA{<p7ddCY~z&J3#`$U$by=0UP8&t2fuLk!5&Id&Pe1W&*sd?e(SU
z0#&l&R@!3v`uh5A%Lm0r{W2H#vxunl=Q6xEuaVvDH)&?P&^hI-ChtSuoQ`FwgaQ`-
zf7r_n<A&Iu4At*rg=FO{LJ7M1Zd?T@lX8Jn%u$K+W%S3^iR*KJyQbwyr{IkH7JZhD
zrw`t5cy!XV@BuPM`P+=uHx|D%!niXvc@D%QxYuuUZq2_|Z_(!wFBVaK_m>;a|2rLg
zNHltnG61r(!(=^V6Q#<UP-{g?X^I*L5hNYu$U_b^TzfD=dq`Ak{vqAG8R!a)Ow#a{
zxu@>)!Kcy#(jhKa^aL<ra@SFYbP~ySh}!GL)H`l8pE`LS+ah{sr3`r((6ivE4V7}l
zD(SyI(hKC93H7EfliC8uK!ix$+==KIhsGpFmmYelX$h&A14|SNH4vz2xpdlsB#?xc
z;~VaqI>Dhe-S?VYsH*pfpmOaJ(lCfNGuvtL*do%|=rsyx&5mai$}6G7HpX#v<4FUL
zDL8-jx<S^IHPKqVDS&!+*XZU4RZqS-^USBS(*wW|kVvIoi5@gLv42`*$P0ITnLTL{
zRHng3%mnZr(uJhNQD;aKHmyjuDykWlB4D*$YBMrhf4D<?jV{R7P+D76eR6JmcAk4`
zo@MWVZ133Ctj<zoiIaM(H!F+6OWg1L;n#@wOS1VDMpZKmQScuM9KzUXj_Rx0D>VK)
z3B6~(`w3n&zShBYeC4N!aJF5pTRNfJrZzv;E1qM7-jzmt166X0q5Fx2F%Xo^P_kmi
zsx7D9G%b&t9Sse?`j!`2($y6FzWcN?&D{H?1>l$c(|uAEh*f(3)1lq3WwKTL^iv$O
zWW(AmN+s<19PY6*8B;qaTm;Bdd~Y_O;y8rq{$bj}Fp}P!X%ZxHqpE4iarp=$Z!<dM
zn)xrXVbX=bzK+?E5cC+JAJO+wxq8J+J-^1;-|^<I^WkBJIl9M!L&sk;ym^&5ei1?!
zW05Ld`12hnS`D+69==8DfYBKic9-0}#uV8n|5w54wx@Rdh&<rgz4OCVLCNSIQDtVL
zg9H~30UC+q60Ts~w~<Ag42Nd%=sJN-1O*eF>5bZ^V|!ulStOi;&r;!&-KVzl<~T5i
z!J$dZsb4_gvCXlw(eyJ@`AgtSgH4-;UQC$3F)V8suwo}>;>(gL;<mt)3|ncR4<!{`
zq0R8sHZ6aNg(S@kLjVCNxJ?9|-S;zqhn(3B3x)LrlkpR`v-b6k2=;A1;|s7?q=_uz
zut<G%5Kp{!s8Pt-@#ugB3!`PftL|ta1hpSt(obtw$YoCLNH*!ty}u8PeA8x8(vd$+
z<|K({6`5=e%m_(Eq}>!|RDK{1#wSrVLQ?eK5CEbVcgAo38+wA^%u$f%*RLH0jo!$q
z^V2jA7_n+$N}{jOmiWrs{xW~!Xw#!=m*j1dxj}eT(??7%K`h$#S99p-gNY7k;Zr4S
zx4AQazk9AL{fuSk+hT)>tD`G%iUKkf_*-7{CFpuxFoPUzGf-3C9c%Lt)m9J*uQ(N(
zIX5c{?VBt(($Z-xjJ(>sI#A^9t|;E{)o32ddnC}30H!flGoq(ZX@nqaq5K>s`tiau
zLpZr+$f$;2k|hWf(Z`7U5sp?Pk4-`{ojhQ{D*7Y2*f}K3mpYHSU&h^?`~b+=ka&By
zazs2eL+A2)=gFlA2VCAonWE*-6}&_0ouOkkHRA0X*8XV3L+WT1%{ll7H_ydR`mSE-
zzF_<ckEAjjV!RSH{PRFu+<<+Q`-gaaXRl!`HOP?VZi{oRD<>ldpK{@mQ10@`^Jh82
z!6QiQJoe649R>SUT-ymK`<nZY$Y5z2I$pXA5@_z=!yk}A$s`%mPo1Stwv}@SNJFBG
z0eQt~=%E(%mQ%j*M*$HpQ3#gc)v$FN1NP>zy#~XpAZjQ6=HI8hc4zS_wyaoK!JO{K
zJMjAW)~3%@VXQhYLWV!r@E1Ez96hh@&wII<0LQCI<`Gn<UcgV_1aJRd66Jj{A@j3(
zSu}H4@~3NZ3SxPxDfS;@)O%;?`71)iXhP^qkRhAz6=7HZNRkM~Yy0(+c4pAp3`v6F
zolbiX((9tq_Jn8Y9eUlT@kh_lKCk7MlXW^BiUFkJq0Rn|a$(Km%r?bpL677P25tP}
zYMIXs>yX`c6KhaOS%=+jgNThN_-XHL0&kn39X?3W`(^IasPKdFMR3}RP?$WGd&595
zFP>?IFw{T$#l&v9qEIRm)Fi4rLU2Kjof-SD+-rz7?oCS9?$+apkT=zvjf@PZuElJE
za(?=N0V65Bt&|E|9Pbxo)R&|`ysgbu@IOV-gjzMfc(WmdHfDdJJ9psha69r+SIy}8
zNB`%J(WXz&Nk%IL9O!83U~ow*Zyz=*!8?sqODmg5#%cAYH~ErCGRY;*RXuIhzN4tE
zo(O~+c`R$IR4d}!o%aEn`VN@7!i4k;x2EhJDz(ihbS04{QW|yIMO80N#^ArKi5?c_
zsh+Bb-fe564M9suc|r*ioY02e$G!0y7a%{6&(?EI{(*?~qhdh@RYH6Mx#B)V4abAy
z15a69TTwFV<QPrpy`eeiXMZ0hsLCf}i40pLqS0qZ*IF;D$9=m~C|^%M1skMTz=_|;
zjCjVX8=qU0r@^2x|D2&(8EKazp%^R5KVJPK;>ZmrHV85NCwI~itI&6NevC8wPJW=G
z!ZONZ19_a!8R?TdkE)JTylhq#|IQdpLJb4ZjNz%HRC&^>_P7}#d$S`$00d^w&YB!c
zK^_4Wj^UVyT5%tcGbVcY6r-l<?0kVF871yLDs_UBPX$H+v0~s6E01J)xP6J$&T2CL
z!rcvh(q?0*p=CY5s#rp&LT+?xQEr%!{F&+W%DnzQo!}zr8Az&vDft=M&SQi!{ErYe
z0DfV$dJ)YahV6<!JBb@cATtmJ+gk4{npY#q1nDyu8r8SmBtAEe8~s|yqfp##o7-M&
zdSjcbied0Qn<VLke;`0J+|WD_*8e={18IrRThBW5Und*zOt2}j59&?Q-5L%rNI>8A
zlb)c&ozDGZub{xUE@+M9{)arID&5a6il32P6y+oO+KR0{Q+A(ye5`Q&=GJeFeYRcc
zIfve~9zTpATow>AKQG7Xr-f>a1AA9k(xOr_&YQpMGiy3`DrHRrahvkytT)v3JE8a0
zI--n_t4M`>g_}X+eM*SG3r-p6yOCVP^QU^Nx~Pg09^E0)f_xX5U3l&(_RK~S)WdIK
zLE}p`SA_oDl)etVx$V%wU9@6zW7RIdeaoX~3~cAyZFU}V9yqYOBrPjcS@hKF8dm5t
zt9TWTFs*R!#RPbTSH#9bPxC)|QJebB=ZD*l^T_=TlP?fXFX}p%Yta4Rmrc7?w8!Jh
z#Pj+j(p@JNI^aihD+#*K{!}Z^ywiE*)IBJAuY<xBzQGZ;MaSkYetRa|sPn{Ode<MG
zb}+l{RH^ximshPK2<g0dGg~70cYxXtp#pSFTbP2SK)CeP3fIEYlK=H?>-kq8MU?3y
zP9S`5FaAS)GqZ$x>4S(GR^|pe%C#ZOd=mtJU4ieMf9L7vX7q;%MJD;xRQ#yl#~Mxl
zS>yDd5vkcE5wmT+jPB~k`~%N=M3~*5TpRf{g$}-{tVBvZbSsq4*$RLtUKDNlUwx)(
zgTXD#t)(8|Y(7O&rPMa@QZuMjJ!l^q{xQaJNu{6t0m!#|c<f7yJ99oG#?Q)gfoqd5
zhCQ*P8krg?VM7QT<;{(?%;hN|eVC^{fta_kkIaWutEk@b{6lt2Le59kj6QR#S+*0M
zze|$P`zoHTIo@g%t6+5`5N&n39WOcak}sDAqOTR7uzWh(cWqgw>U29?T=9({a(@dF
zDODVdWoQcct<OAYyO2mv3AvHW|8drhb`Jr>yvA|jaa|RVe16aD9!&9$qSU(xfI=^t
zI@ZfsszzEjv#_DS{rF5}x}op>vqt~z;p>Vizpdg``2*)B%YD*r<%W3+(^+XzKi};&
zXITztYi0p2hYnZpA_aua?UEjiKE|Rz*DrAR3F4xMum8MC0IdyQm!`P=nxGv}rv{<%
z(C3X9R^o-X)7t~MP0=LwF>1-Npm}xQJ_RMEGheEEKt-8PMrOslAqTR$`$=5{9PJBt
znL@z~{WNL7_y-<x`KPAQmaZ<0t{u$YnM<rr(O+8Dz|uxNxui8y#+KMk>f=t4ihn(9
z*@-VTKuk%+p-8*PIHbcJ-BT#NOZ4*M3|P=~u^xm>rO4AsOdfnYM=^HzNM;j1die+q
zR??f&acHceF1a<)z8IJ+LGJOIR?dRlJLBYaqYXI&x<fWtCSuIwmp9U3>cZ4GwY5%D
z+#H-7Ty?ch$G0dbNS$dbkF}R0xmz>6xpBaU74KF}hImG`0P)}9BS*k@y}5)>q%3IB
zJ&+oHNB1o0*ZaGNfLn`5EiUjYsp|>0d>Ssn3>#)DIefSmZ2*K^15NJk&luZztlBL?
zt8X-d>7MtMOjOR)Vnu$)o!RMDb~EKJt0>4RZH-(eP1fdXbiYMGYHsxd>IQ1N4a^0u
z+%0ugU*H}$<LsxH&7E|TQmwVXOqVK0(srV#HIjL{#?&g1CpIR>mzY>S4kV^C=~8vu
z3=$V}x9M0*?oR!`%=||k(>ZY~ldqHW9Ft@T-2|u-6H6O(N^FuvG1KzBbUB3_jr}<k
zrj`APYgw89_!%m`2xA36m()noX%yl!)M#f-k_}5%shP4CaPsA72fW6l4VgYFf0;dQ
zHEni*7OS19Z0UqjH65;wPA5&ulwLZsGd3Psd=7w@Z!`*3Nr4SPZ97rW+%&1)8obc#
zr@&ol?O1!0T^MOROig2z9$zN(@ncenbIC}CV!;=*w+o6lHGBabCgWto`NYs1*;3V4
zqkdvJl3#rHXIIWr;Z@aofW{V(`iZ?82^+9&eZM9B-JgtmBU%}_0JN#9GD5#a9}ta+
zS-<5D%uYNlBrRHg`2ZGn{;g{2!cun}ZP>74SZZx1me@@U`$Xs=BVp~5?#npN(}hx1
zlUs4I7Z&wvqVQ4lB_B+bSBKC2hb)8bQsK_dE}5yiib5a|yo7#%bdQoD_kWHvUPQG%
zkYWdBuQy)NQ%d~e9PjLn04GyH3AqJDCw49tX@=*Xzh6VyuIrM}du`>#^c_CTWQ`U}
z`Z-}Z?$BtPFjjDICWI`ZSHoA~!KOKccc|Cd4Q_}|w3zj+2u0^Lx{lq(7BHCSC{>cS
z#P~7_X+kciQ}g|Kb`g>y8)kE4+q+99(dYo@y;(h!5gY#+-&H_^q(phc$}WO^-c|Pf
z1&3_dBSg!|+IVqOQ-~TrBuCaCO3F0TnGDwsNQ5luwcH2B$xrOQmTqir6a}lvx1?+g
zAXIsNHQ5+P&r5uPd_Ri%_;PoN`DDBcnB-YiI$6`ktJjWR6jkAHXf#)`r69&Q42r)c
z9li>T0(yw7=35Wfhv0W4X55T-X56;f=1zM{J-3qAChf?$L}7%<8y2EU$`+FD>hda<
zcLS5#VqOO^lWJNj=>RmC66cmxALG&3Vbb7P!^+FV&be_6Xr-#c3_nh<%t!x&QuanJ
zZ6IbQK$fg2mU*Sla8=#|sAHVQbosjmzo^PVK(S+Q{PWGa_Gfz;tuM0f>RXXZfD1fX
zDA|;(5U0mjuxcr~Pwf6skc}Nw<JdUbK%Zo<QWA)V%hqr`9ee|1U4RM;1xY;XX6_9Q
z-bVs}`=iS>F`*nNDJ+qCEBeNDu*l+l#3pU73AwP_f>~{pyarHD?|KUGCNQ&rzKdQY
zb(c!ozE3Dw_CjQa0o32Mn2e0XApnZ!*4osJXeuk|DC(-OPRaA>W$ESokN&DDt0{t;
z^?MX&jT5@W`y_kp=RG}Nre{N7KmkL9cV>xlZz+Kj9QQ|zA`_tlMw5GCd;v-&<Aqa6
z&li0S%X4Zdc<1K%eHgAlLM~jlbGFBYU50ngWp2ZY$(_Z<c=%UpQrB?}VD825Fb(uF
zLj25gye~HwWOd<NaG07!$%>-g21E?%P7mX6bbYt0^?;b!VP<Y+{!v<9YWXl`(h2Z7
z17@OpL!Y?I`<x=cDp^?ml=?o(77!h&I@zp!qUWDi<K)v5qS$C66Rp3hmWnxI7rv2U
z-%XqstKgt-t!@So>6YBqal>j184~~)w%j8#hjfAgk=wyibJ1)_wR@Io=Af__GZ^-U
zhCQhS;AquqE&xD)J`S?IRe}8$pgSrZ&VZp|2*di^RMN6<rr)Y=%c^CCn5kJXgEDP7
z(Ub{hDYF6A{bZs9vFfpE(|4;vaBxu}RY~MwM#{H3hEPJr;rcMb{o&I+(a7R>mf~Vd
z0K@!`eCT)4iU~>8q$!l31Kf5;lf1-D^wCtw+b3q+N-cj%+-P_0rzSQ2w`G)|ZgHam
z-GcWC;GJ*u71>>S#js+_`0YB~FG>{n`FJj4p-t*jBfaV7jNdbuzS<~jL$x7*p*)rM
zfT{dEN^+O38`=~A(2Zm`7?}>%NQntiF9%Pf^G0LU!Uc3T(_4HsnejgYv24D!e!%FX
zv@D(s0A-E{I{;RU(m_sCPW>4}t5MJpDZ0Mm*U`n>CpMs@tsm=fj+9(q)53TreMF-<
zLWTd9CQrx8ZVWVBj~YbtS&Q<OmYVfL@IAYgjO&9xpc?eg2CDv<veJiLZKLELxHSX#
z@9@m7?Nnf83QJ5z2v@ChMc6@jS|kd$4g}xq&yjVTAiW>7k<a}Wh>2w@^dz`lWf9Ou
zn;6y07lDwei0?pbq$EJgD0OcN5X)L$D;kuV2Xl2EWvn$VOJ2DkFPpo%$RR^0U;wOK
zp5Hp57NzAudN_koGalL`Kh;4gsG1h76ip@S2SgGU1E#6HFw#_g`nWWJbr`xa_z5!=
z18bN&0)h-lcB_<3rtP_4#kf3Eh{Hn{;2VHMVX+As9!4DpS(<Nf+c^isnoBB*ThmUM
zKr{lxA4GS0O$QS5?KzCIjFx-zdR&Uy`cCT;oX#3xH9*;cOCzX6<?&I6uC86Oil9O_
za{{2@GC$?2M#P0emNd|R{IHF!cng4Te&kb0)NcXS{{NDcux+h<;2J$a64PRi>_|gS
zvgfXG!p1@-CrfR9Onc;~WKm2K>QHp@Lf?OLut#BL^CvTfIV6_8sFdi|-lpOu)1e^X
zo$fcCB=2<-;7X#EC--Yd-#88|B_Cjeoop_mWXWs#iW_ys8^y+Mkl}Z6Gkiq@hIN#s
z{3?tV*#8iWLrE$xyEkv=fOb{R{f<^f7BBxo*bO+<h~y57Mqv$LHQJD{mKlJk5z|VG
zbNf2W-=5o`rtPP>GPN)w@n*O_Vv`b(Q0J1k06~);29(39#ZYJo(2`oS9LXi~l7KlZ
z@2j@vr175{HAugXP>^*}9HEX|q)^>_Jl(3r0wf{B^}}3<k#nNFoP2@{<S`D_CL9~B
zj&QzsxO)IEAhyrq?pNDWjLYf-5|-t+9$;Kr>h7`wEVzT0S;6tB<dnvLJx_ItqcWU4
z$$pig>~kNvH2z*vuX?RCA)|fKFZIHN$XM*Z>$wWxblg?3@y)l(*KzRqi){?&qF9<P
z|9eHZk)Cj)nY2I@e@z?Kj+s|`kP$&|-0>L5QubtQoEr@y%o-`hP;kfRxky%>@<f%G
zmNyXFe(dqp5^wu7;CA$dFDwC+DZCLr@1SI;NF#x1Y2)djBlS3szC_HW3-}Dp70)7{
zmCe40kn=fC0PXN(C>a80=Ug2^hoX^eV3=>^WT?(-mB5P%DQUtzV+Gk^8uMNYIzV;h
z&NvrU<u{mg3y^B_%kNNVhoPC8`s!-76)&*{gK4jUxRF~C3uFQEu;Ihy)MoF$BtK7g
zYIb5$@60&ga-Jso5Y(y<=(^v0-zl0XJM2^(&ii~@wA6`9bkp1%ncGI6i5>_`C|ME=
z8mVoL7j`3T91%qs$6JYKZF3a(GVH)?06gKPrab!*FaT~hruY_NBZiGy+LX0z%_Uj<
zoExk@iYBaD3w({y%Ir9>b;p~@ZSMx%jPc!R?T4F>8VadR-w!-IOcj8bX6jNVX^ZnC
zN47>iYx2w;6+<$lXDi>@Y;aJB^n*X%jp<z8&kS+11D!9QT2>4RW#EZY(D6C$%^>=+
zLSO<Er8Wa};-)jN47(IGf(U<QibM`AcI)8V$G^cUvBN}0X0))>()Vp-f0e`8oV=fZ
zXL3WIJjRb~?H0dTmhuG|KScrhC}8avFaX^hya0X%a_{+3wiZB2-@y0l=1Lk7M(kO2
z4uInr5;<xTZ)1Up1~f<FlkxiSDX_?^m8ZPJ=iEgM5`0-jzG(mV@Ze>Iu5A@f%ywmU
zIF9r`<m;B!H%|WpZ8pGeKr;KDOP|wW0!Uvi0MTno7M(1t2p~s)3_**Q?H_tes)yc~
z$u9sbYUUxZH7~R_<4bYC7?!aFyagOYT)t4I&ms#5)#PE#&_k42z169OwAAGIpDDoq
zbWwwx?&6P$XF%oyuDnsbn0J4Ok@6;XfHh9>rAP>G{!9*B4Rg38(_zA_mno|MOg;@D
zbfE{`1(zdffGXhS0+1$VcFs*lo}yCX5*-e{DuRc3Qt+IY>z)so$X@K85dmE8d#vs9
zstv_I*TYe#=U|>^haVnEs={_QuzzW}0UgvRBO30euBXm~%rZlGYd3&HrR@jCypDMx
ze~8z(NJq$M^r~R=8xuyzw3(p)JWy=HyX%;0L1KwH8&~vl<|^T9Pj}v5stw_<!*<d8
zqS%!0*H-Q~j3_E#U#*D5OLooX!J(6Buj;DV@Y0-&SAU*Hu9hf|*#2cnyN9mA8FATx
z?MToots&2*==|qN=RNgf{PIELj6-@h+=nIySQ8d5Lp$9zQ4n35h(7+I`Xw{U8&$jT
zz2BFvf-9eYK!()$_4Ogu>D}NjyoFeIBa>5wc-N_USzbLnLbd;FYpIlgV>>CK9<Ic%
z1<nC&3q5J;){<oVyfC|+aeebj<VQR#<sx>u+qaN>=7XX^1Q4WX*syYFXW}D!#z21^
zdsgD8Mcioqdme}4_F_d))jfA*?D&(V$T2!~$|@EBWUQoOs}Cas5XY#xpMO^SsB6@(
zX;O+1hZ(YnT#Z1lLruk`@=w-Lkmend&J4t!JP-&gij*9;%Y{Miz5zk+wPJiL<oY)!
zNDdOBT6)|T?=(e3^WkZhL;P(wIQ`XU`&v!ERY}0YL(u&8=lwxJqBAarH{|73SepD)
zR{GC`^s&@$UFup<nkhlPJr<K@7x_I0P;uMlz73~_Wg27y9m>tGF7Is;y3>0H>Yqo6
zRXH`Z*=}M4?bHCf!LCwlY8ZuyhC-RTsfeu<Mafy3m!jdS$&VRpyT&EfuHNKw=2__9
z(_97V!r%2ZDPwpJ-aN1S%0|@-j<_i!)?oeD%Gl)sYvCjFnC~7}#Ku-h&1Mlnj%ddN
zP-35*kLR~`k83}DQpIMaNkM^LLOI%&m2-1(99Gs2OIsii|GHuNvpJoR?8Npv*oCYs
zfiO%^pG8IqEjI8Mi4WDS^tPZ3?<Ybs?buZNjz^~{7FGd)1E|LuUO)hviKZtSWXH5&
z@vwIO1uZLKJ1XedwS_r8IXk7Ek)kT)f%7*o+qoG2f>9&pF&|M487~1~z6S&Xj(N)G
zv5$G#a0dXe4N9ZZGtk%`23l;AsP(F4!&*R($U8)kS+#qtB+MDgpTB7P_Q~v>^fDGG
zgI=2&-(GNg?5*!90?1A?JPkKRKJc!-Junr69JX~SA}qRNnw^}|bvkBZ6&n2+L$Yf6
z$S_?DvN0?W``$QCK|;`zS$(baUw};G1y;2YyGy6~!9V~A<Zq{{KC3!AQ&Z!@M<T3P
z2KQG9@>M!CZSb~}mU}ZsxrjcBdw~erM4SFeTw3)#w?gYM9K;eoim=)oAh4afEKnmE
zByd<4G<^54J`4RUbl`krx279}DTLa5f#VtFdWf(?b}Ve+iUYV&$BEb4Lr=No)3=5-
z7?Fs${aUvkRp@4|71qzw&&}mzRvyo`sw|jYw_p9NB)}omG(atwFtbJ?spWqvUCn3L
z0}Y)fsuOQV5F{x51HCL`?KyAUb>)C#bKbiF$c_ko)F1&Qgh@o|Nw{uNe7qWV2#6wB
z{kx#?FgUb{<jEkSZ~x>P4+KiS^N#Zf3QEP;fZotGyremeo(x^pVA;hsxfuPjX2gec
z)iLyfQL_mrZtqNu+M0jvhIweOB*<5bh+cUD;(8u3P3?usbB)}6bIO~M`JB@F9kPdz
zzm$(B6+~LoPZmSJ5j&Unn798LC9TE^iyUVyfT}?(hdtqF*zYI-&Te6fnATqMA1#3W
zmZo1KaPpqKYU$HYjsrmykoznE_)^Zv+Wa}#+s58|NHgWp54GbV>6Pa>M|mU5e;VI)
z8rh#QW*z5{W$i4M$Gz|z+D$<9+c;=Uj|-LjVYENE2AOp$=*kd7TpU`aBM4QULO13r
zd(n+_Gl%^I8a*r~k$F07V$CfhamLv>hvxh^q)yoNU@1K914k5=JtIP4Wc}C1uVBxt
zba+v3D;@;l(=|>;1>{bz@Mx@S_z%8{my9P4jlIE4`q&;fi@4Zy_6n;_lM`1wn)TC?
z)EhR~wlC*Z*n$4=NkqV#z;5M#Fs@=}L}#djCYH-G^Ue#O4FnDf8>IGha8P?L1wvfg
z84jztRCr<c@rUylMs71M<@tGdlDLb*?Z2d&0K|qSaydDAh80F<!DsLOm%{%S@D&m`
zuuUvGse_SxzidTlRV@_>Q;mYmA)6p}4f@~K&db~2Qy2dXFB1fHkT`!_!l28TKWusM
zY#Fz5X%DtI#>axKmzI;IJpx2p)m=E|1@bl7q~R(Z>#%HF^24U+lywB9lp&TVIR3?2
zO5K-7@2h9&euJ=MzxJn0v&Q7v5H0VWBzt!zRe*usEx{kbWW=%lM;Sr4;MkBiDE`yY
z@L<uD@zpj`Dgb!!A#p`PQu2it^AV!FKeX~c%Fpzbaud6GJp)+YVd@|^l#1po(m2!V
zk`r2io_&OEmGwDpu6fiDQG<^eU&-T8_vnuTmbbs}z_!|*eFoK8G43u1WI3S{O95T0
zc1`zbjd1%f#tK02USbXc4p5JiD&1TQfk20jKXRa{9USka1g!ZtzZ|7Ss<R+Mo&#~O
zY|(4ms)GLx=Jgj51!z7bv6o2^@)M7lr@-lkx-LdQFM>KpzMd|MjCPhm=9B27ojSQX
zwq1T7t|DkNriQN3BU<`hCFp0mp`El<?E)JtZ+t=m5Sa9mt{0ahD&_w@iC(Yg%}w=g
z?X<}3b-gS`>NLwTYO$Sfa^D6;V!O2}s$_4lD+rXRtE8+>xx)`A0eEN_n9A}Cipp%w
z3W2M{@;S~q7wko+lH-~B7pG6gD(GdlzQ9>e!v;|&)aF!t@Td-U=Eatzp{Ee?h`7Vt
zTK)M@YQ|%4f2P59i?zoy$m<>7NN$C$a)1q~*GWLd<pLQP=2L~fbnHZ0TmgZ8(tNu|
zx>^G}EHOuj?T3bZB_M7C(0doWUmH(p5Dwrk1pqjx@8LPcPisq}*9soKk-zkP8}m1W
z5JC0JX^nxs;UhMk>r@-PpBpq8_IC~_0d`n6R<{7S`H|aHEv-}U?~&cz1OQ%~$Nj6q
z>GA*Y^%hW3MPJ)EAs`^2w6vm>l+rCAr8G)+gLL;03er6^2&i;-cR6%OgTzSJkVAfl
z-}}DbfBnCIti@Wu%$<ABoU`{n`|ju2&vn(jDf+E>a=5KyfSZ20^LftChql+$myd8l
z3mvPqMTqwKY<}<xK@!^Z-b=G)n))UmMZ<SGvlt$NLNj=gVH|DzaXN6bWV+i|)eL$T
zd`fwIOF<LkoG^<M(ykN}40G-OY~S_!lvKXb7cv2!A`aIA+unZm&(ar8Z;ik8nmM;?
zaG<uWX)|b4SJ!?pb;5G>MrqE96wNBDgMdJK!C2X`Vna&#Bss{BGSj^3XKoc`$?wk6
z1yCamu(ImXp9yGL5Q`~UXB8eO4%$P2^NA&OMePBo9(q~W@V$vW#tcE~agq`8c2zq0
zugIP0NkU0LZi~VpfU(OD#{=50olXp6k!(K7Z~Y$DJShEG#OHE+xj)4G4+KbCYscGr
z>4W>ANW}sQQ)4!G)m6Tt7w_Zt>;u|_9~3lrv8UUsdr&Dnn%F+6am7Q+?ibriH`Qh5
zu3TI@rHHt%Zhw6rDcz=~m;_Ny<0^kkiSj}!^yQnAz<4^b+6+DMmQ7Pb72aAlJD$KF
zL-aeXzE|vrS4ea{M3=q;{NaTqvo5Onl<wi!n@yDsuQ4LNE=iDZHA+16NI?;O+<XtQ
z16VI8oOb<DSGBv_5mre34s3X{h3r#`2~=ikIsg}tu=4+d04bw(>(x7jXFRO7l)Uv6
z){j)zP}5lEqZ{3dw0xc-U35VCn7gj}9yX|P2mXi0fD5yVGnXe#lSQLGh*V$$7=TF2
z4Njt<3qGGu!&9N53}>sn<Fl~+blLTd4csr%@yOxL_RNtDwa?Y&+$5qlVM6rGsr$5l
z-Cf*XO;I$f559|(T|WVwpXX$R0GI;E#l1X;$@2a!JqDusbG28=uwl(C*bR?7KoFSU
zbL5w8>6jVXv9+_JeEhIR5MAhCK{`e-NDkmGAFaWT-@LQS?{M<Xu(I9#2g^t6ukY`<
z1-$$YqMO6wlXaSVc6AV2H<V`mvRUJ;?Ue!^?N0GGk3c3YtY0><#Z9gt(&7us)!<7B
zw#4RG*3Cznx92`V5Ec}dwpO&{!i;MwcKRE2*-I4dIaA3YQ;&xh?V`slt!)oBH+O@J
zrnL96&2U+)<@Z<JJmvjL>Q&F9?wyr3XYBa1B>(hNdBT+RX}-}vOS3bR<D)YRlM8cW
zvx_6N3CszbH_N)(&daPcm(LHspNCiZu8Y^;^J6|@$d(-k$BfSCE{{%*&n->Ni}Amp
zdHfK-Ejrrslxf-kO_`e$y25<$=BsKRUm;K9vkGErTT7eLB6}Mf>#f~?*iwXv(nZPz
zz{^W)sZa3~0B!^R02mJV$+xyF_x-FRv+VyiTXngLF7GLyx#uVW;ej!RdD4H3#GT1M
zI9qEEzh$iW&!)18cEKNz3I<`<=F0>fO+wDaBRv{ae~w$aV#W^`gDR$A%ae+y0w|B}
zF_X8^Pt1!lL|Fa<tEgLW>g>5;0Z1xnAP1bO8Q?5ik(=1yy15-^ERpK=-183+OE$5&
z9Qai}+hT_4=<0vjLAvUPHL$<R_vBL&zO{TVS#oe~Lte{Z_Vi0DOxuM&V(E{Sxcd1X
zd*ReoZ!(FGTL!ft>iBpGO9cD8D1No5Ry~&-L}(rHtVkGRNUQzFZSN(4MBT0_f%cFc
zNY^aR&-v!KD5Cms<YiG8*t24E`Cs32MecFHJ|Lgt&_s2I2?%mJ0a_#A4!4t1o$en=
z@|JY_U=0k&1GNa}JN#<nPZv{1VLy`sIy>wXKayvz{}1UIBKO_F3f*6?=N?)!{Eh$^
zry)iR8mSQT1bMD_<WVJ+7Wep;>dqiqXdGjV+wn_}E}ZUWG~s}cbSj0e=7?H)Yp;X(
z6qAM}8;x!G2_5ZpN}!(9{^@N2&(@He-QXflbU6GLDFq4)#Igx*Y65<wTfRILN?Pmo
z)(jy3*uHTbo1pfK_u-58MHw%wW@@-2>vje!(#;ouaM9;#MH4GW^?N0yBL892$T(?0
z4(&4+Km$bzw2E=V+Me$<y`MU9A|z3RW*Hv8x^dpmBii8l?G)(rn_Ifzw_)`Rb#;7=
zIRCwvle{)+V2*;iMB$r$Zt1`<=)CnKXce7?wZfZ3BPvn?#txkB;{Ot@QQ#Q1NPzci
zgYl>IGO2C`uWL6O0p%vOk8gjB4siD}IhIr@y;w66kc?5Bd7j<b#{bHkjjxnk{jVa1
zEY8xy2LhEB)ZX+8f`^plKI3LxjQ6rnRZ;=6tRaz3c>u@S!w+MOSrtVH;R!7@ZTxR3
zo4?w%x7iMAZ6hEXD1A1J3XFj~2!2~<$6w-Bna0tu!OB<4sZRIqUhc)35qmt;+)aKl
z&=9JmCkgBk<rOC;5L|dBJ7?_<?$tG2sT?_2{#rDcP%PsFX+@QA-XGzL(QDqYD6VCb
z3_pKf=a3;u@;{O_6$jyIs|w$=M5R!Vw!n<P{UKmw`9Nr!l}`tIft`S7hyNeV$M0zz
zuXRG-CLvu0d4L55lY=29H=^a%!VU}@@T#t9x=Bl&$T8?(w#o3SD>HJnx-@>Q;<wez
zoV=jWXH_}p*d!dG*Qaux`9E~$7gWEa=HZWm1;OYZlYk<_vNwMxPi7!+@3B8r9q<ld
zjlEL_;25>?>x)UvdpDU^0JR}H9Fq1ugk}(4=|}`{M4GSakI1m^-G?<x1s2T}&4clO
zsn*OmLIilf-+w7Ui4I~tP(JhXzWKWx1*zEF(75Mi-~U;4xS{w7Qmo~!o8>Ksnl-Xv
zs_PZ+`u0Z*^Qju-jyW@)^yRO|WazIri%626K$}|jZ+<={*ws3^S{!*`dsk!8GXf6*
zdNlGgkU+Xj{iF1E&oNNCK|_4cWu#FV0(8!UhED3U$59dA=Pg$O;@xtHxci=)n_*W+
z1o|cD$cdw%1I6KhhWft5QTA8n>P__jUW@jK2;XCTZ-Wse|HBpWCi`{2Rq8Wo-esUw
za^JOzwZV4Zf&(;3u0dZ0OTzE|Jdc*8EWaqmhsHUq8^;)+JGr^0Bx6X7N96O+j=guY
zeu4Zg=yQGlGfbo#Z})C%w6jM^g?lJ+JvbyJBs#0#;P`U>z6o}7z|8*E)mF-k2ZfG}
zekY^mz*+6hqhs(<k!F1qlt!k)TOdGj1bU9rAeXy`hn##ZQ`X+=V?99oVxd1Y*g*fd
z_a{uF!6FjV#4h!FxoW}t5#fmEGD5!G(9hME3B|cPwSIMbA-+EoYnX^yIuepITU;#`
zhDF4t_-ZipK=eQjIE*iID^KeZM?=wNKR;3<<MnCz+!uYdld9b}Xqt{qtR3|6XI2<<
z2WwWC*M%;y3>0S0n?1xu*e;z@$f82=V5AK@Xf(eC>+t^P`{#*MQ+)S=A7yQ)itB3i
zGBQCpD0DVW!lJL?KQ#=_J%mwELO-5u=Dck2Ef=CgyKiC|&(mPPo7fI-)BUHPiU+iC
zl<r#mKx@o;hVaz1*y!A!Om9|#F)v}jt@*6MlvrZBrzPV=Yf!m+>}ud>@X4L^U>zY=
zqK~|WIMdI5h-Pfx+4FW8BaF+b4+S0Nnst6eeC84>W2Ur`aI?f3wmrQF*w3jgK1h|$
zFaPMzVlT}TG^S8ds=c~`%iOg}YJucSLkZXU1D_LdT9^HwpQTb77+1cY1P?9mLi>q{
ze#bMq`}t@(-${=Fu@@j&aJU|Des}59iW=K#>@Gj|9Rv(*L7(UMQJRHwh;_s2fsx)$
zHb2C0B+PG7`GI$%)y`STe47~&v2NRI$O@6=d5T3=n^TFu$O&j@uEO6yL8)*5E74<K
zRkmDi1~ed&L|YIuNAdTTk{`VYce*);m6_2DW1=*lx6x7yC-@_x+Yzg2FG0hOXq+>5
z{?{SW-R+v4yMx%T%}b)Z!FUlNa9s=kch5804nudoO4;C-U5}*YDQ5Zl3{*B=O7l8N
z1c3JITE82GQD#SP0-Kp;+Y!i_*jYDiQi-PPUqfN9R#cP-25&hlWV{la4AQ|27(E0z
z?|t7{bQ)f!{!u-8%S1_j#LjN#nkQ=!i}3673w_b7k-D1@U(8(W|9|ll?C5iRTBZLI
zP7y!7G_r4&eJ&Y09sXhD)(P+bA%8;Ug3|>1f0aL(5wgw!I)9-o!4vnwe%UzkT(Gv*
zS~22z8~OLA6{00Ei;G$pT%Xw`(f@B+9yb49S{}j)a1{TusA169B==rU2lO^Q>iYg!
z$`A>pYG!Y<@U6J#?mJLpWvR5kMvGmoR&o3Z>5ZP~8#WU_F$67=S+_zlIvVaHUAQ!L
zv%5mAH-0z0{;WnE+}c0-*0d0PB%oDF92+$FC7*{#0gtw>`PJ~#6?8Kf|5xiV&^aj8
zz8Sxa)G>APbSF3fVSq9PZdd1hJj`FBNPx1rxw<!(G&(hr6z~4;<m;>aFQHqrSNK0w
ziTCIVsKxf9ub2lE`N?}p&ec>uc$3wUe&vJl?#|R0^_3p1;*%zR%#mE~xhYmIRH^^f
zDzMk@7?;yJca|IU!*HbTJ<;UXj-kV~E(++X&0p{<0aQ)BzJ`tJd`^Fz3|}M0VeA51
z$8#oN;SkVdbT536;);wh;_<Y(o@;j14S!0!1T+Ggk<lWJmIJgj-Uy)0^`~U{-2`gi
zKTBn7m{VL1P3j%_HfVt4r;0-2W<3OrMBbm=Z_BU1k=8pDGo2B=PG1fKP(@J;_m;&L
z?k1j{5%V5yB>j28kXPBX(Ee^7tk3#ngoeI1t{Gxzn08c6Ld_WlT7NS0$tX}9ihkJi
z6pDphZv2m~gw&9j?-^*I%w8NbC4eCcw<%7l1BOdVilPS*Tx4aL4dj~*jLh3Wugl*3
zu6nQOP8J>@PMEJ3<pdM;yX-A-A{KQo@W;y~kDmU(FAL<$l>b|S0v*L+^`z(8ISIsf
zJZWd3g`(BK2ts|^t)IQ4)yuB>(a?o9U9<gVCgUf=nBbShmanLRMcYMeiKIPMf*pV1
zZ)P5f;^BvNx}(nq5p?2*?DyFOHpZ_Mcr>+7sMIMLS(^^@gkeshXaF6>bSlF*8q|Ea
zKCGe#Wi{|7;Ms$o*JT5{nBVKh6+f~owhm~ooxz;BG_9aqjC}{fIG~5GEE>(LC1A)C
zIBc+4L*LF#7T+zRTWy3YLdldUx)%Xv613R|#AP%c!azu_Y_&XFnBr(nO-~)(cS4Uh
z^psHDE-GuoL-}s$wX&KQW)G#~I01oI6zU$AhV<yny7l&zP)**50QwA~mlw{wA%AMm
zz~1Q~xS}>`Ss7x6485KxGVn<pk}2bRXXxo@WhSxxM7<pyd#eVI4aL9X*%6RkD4Nt>
zH?iN@6@U6KlV3-V?2t*fSyfZ{<8OT?CgqQ+j@Of(cU)AS>la)17Oo`X)bRac(v!ct
z!={2p%+(K8=MCWchC}?{rZ!N5Z(6-~b{U7E_C0rE-u5($(yr%s8*TO@2LkEGH#ft`
zBaw}(9b_$mGL3A&_>X|MI6tfc(9B+a9Y_0`mmsR}K?f=UQ`7#fP4_Hxk3YOh2wTgR
zap`;!VF#;Zn~f{>UFJ!B8o6%f*WwJ{34H;JW9zHDK1zw#U2JgPSUMl{(RXj7)h#F3
zNih}eFX}>|5Iy32T4C+-CiQKyCI*^3fBO$*>4->FpGuto?kb<PVVQ>|u6DUo_RGyz
zq4jik%Lj`MjvMqU(rVXg3%$d~&0Q-mc>XcvUvD1e5uPNy>g{cFyHQZg9wF(wYnP1f
zHACJ8h0yx0GUht<Kzxqfz8XKmih?1wwzirM7SB#8?OOMH?N`@6PcMI2VHxhk<+!nK
zbu`5YiI&ELcVRs7S+3{1fxl|&sK^k39D$?W+1-E-`5mtGb)V~v?wxtq)o-as@{U7T
z@o9Sy`!wB73u~NNyN+6ZyWMhcR3bM#qo;*vVU*f}qO&Pv>3frh$Q#HtOiVVewb`K{
z=ND>Kv%PZgO}vGtl^I*Xemws%Zq(Xu!81qi<^#Wa6GNuqa!e}24&Jjln!P)Bvye-T
z+E;f68lT#POiSZcLfQJAsutiTC+1CahM%m5SXxM;J{9__epaU}7Od*pQ!0!|z9<Q*
zn$3Hgs&yN#Rz?Jc@NF;`E$<Ux8h>8rAN-KF{Nf+apC>&|MMGybJNBJ7P4hG!7bhvs
z7jXjx1@^YhT&OUs)Z26WVvuR{tw?5%Ue5FOR_fYw7Pq;g)8w74^ks>jtywr=S?d<p
zR@ds?(bk=u5=2D4IyzeAfUuPguX<<jdtP2rwmq$T)skI-{HN2pcZ`(lrKCu+uiyIf
z7G;3MTWu?5w~?|<OXbth@imds&B;*R^0T^9d&|qYQ$D$$v4XEcIRn-<k2-0^ZmQ~0
zm+lVZ!SYG7+zWubu-!c#ph<#*+jzdxJJMl%n()U-aN7)#gFEM>>7BLu$9-QrR|deh
z5o<oplnB)>zvIc+ZHaCpr-0^lBT%=>r+3m)QpPCdA(uNLYb@LesA59}1qE}#&^F&M
zm?szl!-f*ROKNIrO=qO<TpPoC<b=<LRuC%{_G|wP&D!oZ1TY<`Z(hwOLPV`T81h3C
z;iO3#w&kh2><0d3fyx{PJvf{l<s9wcg_;g)S5Km56$fiumKbv=M7=Gm{2J&ZF!p+W
zoDjq=uX=(fnssmxZQp!4uVGN7Swb4_YN&kK`Jrv@da=X<ZR)dq^UdIhzM&>B75>5S
zz!jLDmxjnTmah<x8zr&pF``_juS<p(JIl+_^9HvI9Jj}`v_26;k`pHx?#m&RrLWb2
zZX>o3(4jh8zIkoktp(Jv+8`z^J%QRm1Iz2{>n2K~EuOGHTpR5Da2&8I<M1~vs6)XZ
z;$5ypXL@X>WYtyIGWYg=U{74~Joi1gz47&7Yqe6c&ss7hPr!$-@99A1d&OFh{tOOA
zsQ$dY#N#WPj2+gdx@8pe-RHQw;p0x*siKa)yLd``x4wQ(>#_bfn_hUtT~?3};=WVw
zWcR0?KKdq^I9S%YV(o@NtYPJsj{E-a*C~UGA}#I<t{hLVc<=p7R6w8s-g?k>);sSN
zB+jmJy^Bm*t-OwgAhA*&-Xe<ZA+aI<p7i%6(E2oP47rP4!o2T_iz6TDUp+rCj5|dj
zwlBqQPP>*tV%`R6CL)Nqn_l?x`sd|T-R)Vg&;hNm7Y+f(_ox4IsT;~}kS%=6H=FAz
zIS)*hJu;QJ85MrI8wfC#R4(7??vuQOTsvnh614@mAT{^TxZY><Z0UVBGPI>=ol5(h
zm!!JQsw`yCqU4aGnrRNDY=M?R90;;q(fUqlmn;x^;;l^iF*U}mAZ#t)4(qjDp4!V{
zOdHqSu8J8n*mhB$`yB+Fi(RiziM7Ti!-=I@ZvV|M&v`Ir#-DA|E`KXbGGJwCSS$6m
zqF`eQy!*|<GQHf`VN+{ZylSR%*p{_-u^OCax3{07qlsyN2*RSBjZYJ!6kmq~n~5zR
z_4v-yT=Wjrm9*Ke<tkl&F`U_RYdcM9Tk<`qwO{a=2XFT;HboDXJm<RsOS$ZH^(!yS
z_tsbiyYv<0a7+wxcKt!cUA%w<T8!L`AE5z(HW{wdcK5dx-I5TZ)3h#>*7o6#E+wVL
zQvI!tpP04^<JkdNRc3ylla_O_u#EWsEGCHK+9F55u<Mafd;6OX0-+ZftK8j7?unP=
zCBsODAK35s;sh@vA*$}L*5oA(M4>zKin)t5^1<jidg2%nZb=<zN^T@|m8^ZRcys8n
zaMrSVGRn&OWI)tJ(WH-Lr8uha_NbvF6Pb3`>CG{I;BlFaMyCC_ly5Jr&#9(k8@F7g
zE+|XuyFW*JA>JR>{)eW|B5{6X!PU4XqG`QSIE6~rLc7Z3hjc5RGyg>;)+E9C+wzHu
zopw&AXI<9ZK2eQr>F)HZ;itK1+d46%9oFrXidTfEzDZB^A%n+`;TuiqdjkHDLS#DH
zmW9B`@k+rVV-3=g%>PR4u6}2t(HB--_q)&^^@Y=?SDZ{5(fKYEK~pKeGxJ0RU6!x3
zRmJV8Gj_W48K70Vnm^HX#4NTdG1vL&S*Xi}Khy57E=`#k3Gn{$7@5e6w!Qn8cfO36
z4N{_=$M?=GwT=*9z2gaeTR2+Z>_;nwm26mel|*C%F>7*Kpn8fUBb>Xv6MmK1qaog+
zP%I5?I2vqZg%O)tFypY&!#Yla&}Q@6RW*m5R<icPBJA&m4S4LrOd2@LeVETavl-x;
z)zn;{m>lD?j)*S2je@EKysS(&mblxfkzGbYsJT&LCJKZXFA7k~h2^sU$Z{GY-^}@b
z6TBkUd=ArMUtl1Ji8^k}s4;VFNjGn}!H`*KZRkM#++C=jVS@%MiyAJ1v$rF5yYSX6
zpOR<_f{&!Dt-Jsd0I~fHHYuY9ew&-2&DTNP;S_gAD@Pb^`@=%s6<;pi30OkoBcr4|
z<FdLCN2bA-@rUx9lVelP;Bbz!-61iD{Myz~cxNJytB@OfcOfyYuI&4|)7Obeb&;KY
zY9{6qI<iV#kL3=)2$$N8=eJ#r>9(y9a*^gjrp&zN=T>z>hH*y}a2G)JNFA{iNy>$f
zCbT|V1i)dI=g+ox)#%t*@22UFH=O8U{XPfi)c?GK%dZbBI>%nWtO#oH4GM`SVK77{
zJ3S98Sf7*PFSOnvpcctaAN9IHt(p=o{bLLT@*m~8Gh1iS1a;!XgqH&dsUIcT4K;Q4
zoC672Et+p~$jPwSg?;}RSB3kMx^S!#QZbhMh}%woL^1R{Tfuayj==d54o|%Ci#WVP
z>X=DaG#Aw94gLcvvOG1lM9_1B-}uxbv1yR+hm%i>7PI+M&Z&i)Us)UkNXiH`dTFw!
z*Ok33nL6O$cWT`vgnf12A9lUUe>S^0MwI8^+E$Z$EHP#V#f5n88+%*S&CF@<MP)P{
z7oWZQw;ZkoY~^+0ztPyWfpTG9+sd(uppYo|Kic|_>RSZgso(cb&E+yG!r`>S9-A|p
z$(0c0S|L6$zPomCZN}>0{HoO-mAW%~ryF~(9@iNG*2CUHMjGVrD0tfnY7&_J>(5!s
z6)xh8q}t786(;0;iBBb%3J&mCu}bTD+4_Rz76XC?BTKMUz(9d=<SvHfS1AR!ROsfX
z8dR8j2T#IkFxBjCQE8>BbZ4yHX+#{~^RhCs0x$wUDFoih;~M!BKE+D5Pf1Q#1=-kd
z=Gq9~#pjdCeYw$qllk<2xB#5M6n$DZRhxdyTf6Ts8u(KVP^Ql3u+)h72IUQBM%T0f
z+ZwW^cHuO2u4gArf_|EO(S^vb2CR&QYji-w#1DSYt})41SE+lk8LGC%f(QFb%^nR4
zg045|>Pv0ID=d1fcDDD*ic5*E{<`_8avabm09B9QYq<5kliBoreF;P~28K#YqIJ>6
zfzNVh!(gaa_%sYj?|PsYjOD@=qM!Fz%Z0BT&;_)mF~<9yRC&m@;cHJjox+FoeF}Wu
z3JyB0{Hu*R8@7853-$dNQS{o~L^HD}{Vm?w1s$t<y|DK=8F8lQ{5;1?i(D$UJwnF{
z|MZ?4_`Sa@8YpY&ZIHNC%Kd87O|RvZn}%1iP;Df?cV54cgW@LXmfo+mL(Vw1Xg6!L
z$cMlR-TK25twd?YvruDX3T}@tx+MQwNoO5bTl66(Bo=<nhepRD6ww+>=qp!q&1sJJ
zcQw~sT2NSTwJGhOS#`o=9;O(emOnVy-&-B7|BB(yXbhg}oo&+_XHC%@azKmz2JZ}s
zq5`8fhPt2QrwcpZb{BAkL}l`OVoqiEG)H{wZt;s#>vRHx`!u<}yU1aez|c{iTU2*<
zGK;H{sbagkZlknpI9XqHjUMvo*Q?&v&<S<5>3ShP$iXez!ls@BInSgTHxT2=1d*aR
z2a`|gn-+~XbYT&!J_kRcP_fdZfA?1XkG9=?a7fHQjfhH7NlQv$sWp&Sj;vMJ2Rl^k
zY2TMI%$+?(`bi4xDo)r9YVTeyu{HY81Fo4tiSf4qW*YwHfjBiCd%4%B=1+E7d*%`v
z3X%42zQMU+z+Dm|nK;L=VQ`bez^*<=os)0IKKjsbhU_O$%Ar4C*kSK~A2QWyGHbHi
zaO!|a*kyaMic%k`x7+2kGWHrCs%G&mPplmTyYLBV;T~4|z0C+)!oFwBsca3p0eyo8
z>}j7(z+xHOStU*K)x>0VKf^puIhK;FRB?L0>%cDprjuOd`|IG<58XzjE<3D>M7u$J
z^WQ7K>uS-`2|&G>OF9a<W|I~*JMrEIJW_vaQTZ89y^hb8RU<jj<L~_4OzH1*FM&cF
zjcG7T<Wv+!4V*@vci(qm&SYBb@>L5zE8h<yjF1-<of97UN%cWgX^KF<6po{m5&K(u
zL6qgL{aJiZFy$eb047}Tiu#k8P&Fd$py(WGb^`pTNpR7yVKE2U@x`4}EL|D{Evcbd
z+#Z#R&<XR3*<%EOq9vX@=lUs@!yrXQEPk+qG3VCjZHgl69WD032a**Qa@XMdeX~Yy
z&l^1qe1cKxuy*Uz)D?NPMy(hfopbbw8<x?c93jWi`lY)qhQ8aus4QB&<MyN-p@s3!
zOu#SA3VMu19?kLB!IRj!rE3pn_yJcA9RsR@8ZsXuikj57hI@hjD0^DM+&zGr?lx;V
z+i~|!Be+5r1lixS;Z3gCFkfQdGYQ9z*fqwHblMnv6Q~-u_df|oyW0S$+kg+N{lTs%
z7w#PD)*)EQ$Aj!pAk`^VATws8Cppm?6apeNY+l2^ueZ-k00tzocak>P_Xp-fLVoor
z?nb?z-%LjIAH_F0AA>-@Krb)cq^q8o1mV1E&#od1bM*F~wHEU!=H$5hTdz&&O`4}}
z!A&cwPM+v=t;WkbBIdvZW*_2j6qpU1{r+kj|G?Y)**qRT0i9|V2n4~tkVk#xFMKlk
z{+EvQ=_3;yyXHFxRhEn%OLrt1>q*8)-W0`=0mxs6a^lM7f$ILcb<416UJBhw1_wPi
z-L|}{p}Bdmd7xmtUg@7S8~mjCjZ-Y9R0T4ay^mUC(LadLj%?XjfaCDi6j!csRl{h$
zw|aiYt5bF47o&GSY1J9L|CVU|UG8E2{(4T{jgRH+(4SDQjtUf5is84MAbqht#3fkC
zsr`>u3QmdoyuT7zxE=h_w95Q#<X>rQebSDf+IQg>VJ}<W$zvAm2NofjMS3K+d{l~A
zbv@Ok5izg*?xRW)3M}`*z^^PKX-b%axmuLEioPZ9I&Qgi8qCraE4e|6P=gm2w>%v`
z7)^eZ*3e;UVc$h(PqAvqsB%U@J!H%{1f96w7(-tDq%h4Ezi83Qz0vWf@S8(61=hP2
z##q;7zs#UoZMij6R;AqJ6bh2R!Wmieg-WyAh%L6#k7e4w4ns8+8wZ&MH*=NC8en_Q
z{35(Sy~|yjS}}a_nP57p5Y_j$q)eRW>N?bl;PlRo%8?6u#}{a#VcDC8jTAFl?APnV
z{b8?__PKwY(_=yNG)5vK@bDH)8h~_{vcc1QN?UddbM9KkF2v&@C*mMS#fE0TcP>eN
zD+s}K8veHdHtOwgsaJswzF?--av%3Uec+U;poA_|E=-Tw@a7{5vwAKXjhJnDP65s<
zKh}1{Y+-}HGX-Dmggyj?O@8y|f+*^qz8IP?7^fTa-!m0E4)_s9^R551(iM6vg(rnL
zKC6Kv?n7)83}_C@kONWgq}znn?>$1@CQeunKLL<$z*ptoA>h0?RN)5aP-l6o?-XXv
zEXc%)A3sbXaLbw<DH7lrsP_HszgIEOGPL9>sMgZ4gDs7kqepxDG0069|Ds@2>Y!fN
z#wUrfnN0Wm!`5V1M_>5fUZ`f6SV6TDzOnKbV#WIaiz((5=i4$8UjC*X0aqqjeSlG#
zidGZ{H+Jpj#b5aTww8s?ZzHJ-kJSMG(=$cozlF@@+gg5834xRWXj;V}-wmf_H??JB
zLSZ#PieG<T5eX7DuSaV~Jc7K95mu(FKx=n06ktNZY1e*zi0_Y9q=RaR-1hSe{b21t
zN-V32nxDkTO#A>H<Bb}sp+GqmF$%_aef$Fyrt(G-6lmH0MpDYlcd}Dp)TR2Zl37dL
z7T-eU?0?bmD$`96_s0;h0Xeld3Zk+EOfl3rkD+3>4eTufzaJCDFUgDBe1H3kIp!HH
zMLAU}*4u!@kN)M?6(iMKJgQHPjmOiNzu^)0b7Q<&mqdqh81{{=W>^}qy8Q6HQKV`p
zU*P*NW;191gaqP`{YwP*!(C~zp{(WxEps{5^X^|U=3_YB7j7Pjr$75E-^F6XXm36e
zrA_|4F#Q&)^`7*vwldu@2^bxorYHyONWy$&pkScDDj2??FD6y>;q|2o$4jC|3W^Fx
zoy(6#uo_|(x6ZJVT^0kegC$l5Ql9!?i4td?s8~h|rNVPmD|5so<JBhkLnkJ<Y+{@n
zTdatm+5P$TZ=ZjtTs&{SX_Jxir4zsF{V0$d?-nN6=eQ*02XiI}$O6%dBmy2GNB!YP
zTEh<{#o2`=W&4lqIOb|Z+8<0Hys(c#o+oGAjI21#Q<Z|t<4N}&oFa!v=bhr;c=~ca
zv~Z5T%}Pmmt+R)pWRZeHCC^Em8Ih26=*!5+_=!jxT)STA&Fkk^1^xrZrz$RSW^XzX
zA!4Y|zB}@GNG{zACWlk7h!&UbltzQHWXsFl$IY<4v2DEkVe&_8H?G-QQMg~n*oJri
z@s+GmvL6Z`I5ct#`fA%fd!cHGbYKjF9AtTBo+Gc<3bB*=>iF&0M5ckB5Gxsu((@;+
z9(XTRMFx+XXN|d%2Vo!oe%G=ithy#3$Po$-6doy>q^Rnt{Z=SI__Rt_B*OS=!`>q^
z4-K`ao(0*7YJ?;Z``FO@G2z_@WXdxJb7n37&*A}t8g!DP-1$j;9I6jqTfY2s{m=FK
zrzCkb;`PhY)V@83o||7k=%FWpBm3TwUJIch%P1FT4}+5yDTw~+X4@M$$&gu$b-7uk
z9AiZz;V^EP-iVLvx5X!us#<l4!;QbQF%}9PP^^Cu&5!F?)0rt67Da6Nj&+(+9<-*c
z<sPaR{?HH_cqeToK^RoEOTbydO8Y6o%Ec=>`gzMK_Bd-42(^~dkF~m;#H8?K&CJ7x
zb_l+qk0klMhA~F`S*Ps?e8$z?&6=$E5Cx~$`oHYD&&V?dS4jakqdt*Q#rK1uCqLC0
zlDTPu`d#8c!PZvFR!Rz}7}Aj;z@{FZ+7z~NGz&_`${uk>W1L0G9!}cdi1c`PdN2hr
ztJFVo^d+aHrmogMmdfQPicmKRe8$c_O)V-SB3UIK0_%Dp;5hA{45U>xM6%6h3_S?V
z{AIS@F+Xy8MoV%CtOMgA6j#Pf=5YYN5>BE`W>#iO$KsuQ41LA~s5zphx;C~vf&rf}
z;!(C*`|Z`BZtu`2B5RT$ozM?zn#xS_T)Tmiqz@F#voai>(Zq6!h!Ei`jmwYzbNK4`
z!9QO=3jSVn2%ITCUd@B5lY9SbO6t0vdgj@&FfBgr5Sy?V4-ZME@8-Hl#~Mkug3uQ@
zf!~if7H&HTot_(;q`Z7%CHj#+T1vs&U?0nN1V8IWdBcXMKB#l23GoPNv!Oj(Y1>M9
z3h`$^pY6^gtfwb`Tp`TDN%Gwmf7+*Hs6wPhc%(_BRkr#CyGqN##<|k)Dm1+>s-S?m
zEX&E$2m3t%D_JkbxBSmXBU@XuDsEMI;4w&KjX5Kt#Q18cUT4jl5b!xAl^h}s{xN=@
zB&;NXRlg~2Aq|)M)!(zU;_SqoZ0ygWBbo2+`V^e~%|UXymzkMzbXk!9&%F~td!t7G
z$+B8nXQ7y_^WOiTtKuhta4Y-Y{;x?xxSa%n(0uwmkp1ahgvXlHZKC%@$@Jm5Zr7a?
zm%%Zdq}O+c=lY5RvEhWYo&LTyXy@YQl`59jCDw+jhv?Tpx@W;73;RDM!)q)qOlXgz
z4MNa_o#`Q>1X&846j3>|vQuYMS1Qj({tdp&-6lpLZ-TA|BkJ{Ou*(v19ta~o`N0Ge
z!VQ+Y>U39CM4aY~onLG!$k6ruOn)#daI56<S&n2iXNZPbP#8iPCqX_(D!1O_dtJ4o
zVxH9+%OS+F&mn`_`Ro>^cl{wv8BtrjMtj3@hqU+13g7S)Ba!9w;qcdK_M}<dA5l62
zBebxWboV?d@)_0<4tkGWVY}-SXT{F^7sB3scHz1iH&^u{FFy}b+eY>CuA~ncQVNf>
zzRk!(UX@8<dw6pFF?DhpH+J@rO0@4pFp5s&F+Uz|@nc{MW(Ag_jX;a_M|)cIUzUQ@
zTGE5vtx1{Y-kmu=(D)rbcoN_qI}$Yg+z@i{!L$;@w&SHJy0DEWewmH<r<HAIkMBA$
zTbp=|7x+pf7v_-Ow=Ek_vS^y2b=#xNg9w?nFxK0wEQy3gFTAkh6EfiuEK3S^ns6Pn
zBcVu^(fY)tLdhhXmSFc(l`o38t9`YfJwmW=h^Js<BeCo*xnM~TTKo^sHu98J`O3Ps
zPCR@u;{`g%ZHUp$r+j9gIF6atzN1LD(4uL$4j23WNYsz|`AJ%IPLmko(>i^GucXnE
zAK{IU6;ay`&u!J-wHjYeR+&x53-v;tV?38UOJ1jN+iAM5phYtp5=hKngATv^sgR%G
zf7gAAFda`nCVk(PYaq4dx#O7dFRxTmRHEP|KCBK0+hgz9q_{J}`Q#mM*;*gg5#9T=
zE~!UEM#y*}JneYBLZJ32zrL$V1-52r97a7Ualg<1gfQfSP3f!o8BZzNH_IcXG8wrS
zBWrJY+7xy!0-)ar7#wHE`js*_ij;Cu>_vt{zI50LkMNPuXDR868-XWrwL$rq=#!X!
z_V0^T(YUw1H>2kbdF8e)z$Wh*?V>`qC-bEBm^*AGQYNP9O#VeJ6SU1sEKGNdOaON5
zbK^;T8<h0r^-Q&o{L`t25p6#kkS3v3^s6~zrcqsp3cm#ZB)&E6H}5e^{}Mt8T)&?*
zBR6$%@)(dC7_lW@nH!M|Rbw{Umb_>5ZGrH#KiPktAT+!lNQI}#d1PjVz>g+`MZb+-
zuVRY+4ey?I!tR0k!>i(?JL{5i+q3RS!)-WI<W_rFK|a{?MWt|5o_w71A-kmfqXA}+
zqZWU>->wWjsi3(lef2tgVQ?WxG^>+LPg~)yN3ijT(MXm8iOhpG@`|~7t>nqOhq?+x
z{ZVQGvXbW=ZO?j%pc`#n$HLpNp2(ScHVw;!{rxR2t>TJJP-ijPBg=O8G!MSp<4uTw
z++yBb?N>SD&8e{3H*_b_guWoT+i=eQpbz=7;hAcV84_&kX0c1<K1mYUhrU|b=`*Vp
z18<PPsLP|N6MQ#0(@OFabs9EA(Yj7o_O#`BtdPDG>UxXt2&KF^(7mcV`ilcyhOn^<
zYuUOHPffnCF7hyvd@+}x*?Q&b;+~B|Lx)bU@BPiAw;;)0yAMpVIw9FM#oI{1`f16>
zo)SWJkzvVjRx&rwYt`(ZdiIX2V~1uS4Krr{>!eKA^<QZiO$gkQpU<u}${1<CFHV{w
zm0K4qv#$Cyl_1jZHe7OKL;|g@BD~{zQ$xkOjMK8YL~`|X_|G2aCx@2iO66p2TQkk<
zMDid{iW2KLFDS<-6)IRQ8aRiX^1aiZu)qHNJo1>pH!)9GLZ_5+@mU!B;+G%a@^ZxY
z%wvYTar1=RX&MPui{x9>f*$K=Gf#hP>H0K&A>7LA^UiL&>>X`h;qBwxisjf$-0|Kh
z8FEW$6?Ewpi+oF`w$Y^5<6n2+h1ey1GCgNPx*bm03Yq4~+jEVNX#&p?q#s*)U&bY9
z-i946d{u;&?$gv!4uE9z;TEp}<<&3O{HmZ08Mq14;p7XP>$H-3;{1;zY||7ypg=@f
z!XDwA)YnG*3i^K}K=&4f``Q2D-iit<#1B0%;80_EhMO)?${g~{Z~5EJ6XmQnfcHkN
zJwGHGYsoH{HLi)b(%vK-bTTR-uj=`Y`4#y)8LnpeaB1}L+pyug<FSU#A8$GyOI3+N
z=$v*8aQl&ayw?+^(yXE84WCBt;8qE^3-2PZR&BLI=F%~s=Vy4yY_*!u5u-PbZltEj
z4{4(%Ukh|=PVw}!+7TS?&y+e5acGlVk!l_zJo$OoU-33K92m_TR7<py;SfKZPdDf;
zfZA*dwBJ54N&ooMdpj7J-=t^BQ1IE(#rsPvk(rE29AWvw&bp_5DY=Z&%6n*Z(Lbn1
z_<coT$_uyGb5~Mni4g>P1a@%D_jc?5_A_p}qrq2DB1}1{3aYMg+ci)90#%)2i8WjP
zFQXT*p(o$NG>07}0{&)y9(ycc)ad*{$NR@WwvYQ`FEuKKzi~XN&}E_gvyJC?UpPh~
zig+~0E)+Z(xKn1<=Me784a>_@=DYsoYKfu&A)<LY++^F&hz*rtSbRA=9IB*Zvj-<{
zjI9=NrKahN8V}lmOJum}7M!U`h`+d_b|Enr*pG%F1Y0W~*Hq(TP~56z9~eU6YlWd-
zwQjd>>IoN~ZG{cKxJKGqa#P(BO~c6Kx!55O!@K3ox?;Jr7K6r{AHzsYIyt`)-;Yk^
zC;r-kOb>JKOHNNa$Xg4y{wd4?w5i*m*HY~draPIHQHW(fp#nSk_KRip%~vuCdkpB>
zfL^Ee>xZcIZOs)A#RH9kJXUIN8B8a{yadp&pTN-TC#XlB67||5nOWZ4UCo~6Zwqi$
z5kuF0s*L((m_LAl9aw=VUj#D0n|zk<g_I?GU+BLlYwejyYa^A}CUfc(Y(_K*lk^}-
z=$`H1u`$YA(k(E9H3~pYZ20uM7ms5Za2|PpE+YaqLvQxjdI$RWzt1I9S(n}LXlfKO
zzN`jLhrYet@qX?YrqybgDmEYWV%xr#<VF@1zGxA<A;C&ZNyb3?YZ!ROs|Pe6NGQ@^
zN$W+WT(dHt1mlPH=+jQVK#wFR_=iZk*z&yCH+Zx9ZABdmH=N+IRe!AH-qbvPMBI;t
z;SZn@W9Wt7cj|G??Du-^__I4|Li8x)d*WhQS|i?HC<L@u7rqAbfk1Z7Bi;E;bH5|9
zf`n1o-#shx<Lf(~tA-k4<U`56q)vqF8tPO1kiJq4(wzR~w<fcrS@mr`K94iJ%%B$e
zRbu|y<sh@DlWBDF_PScDtpI@6<GN2Th81OAhV7N~(YrH#Zp1!@#pan4^Oy)XWuD_M
zA_4rYp8G+z<+`LVR)?I5Ec8bK|2Q+9JSk@i8@R`+n{ImxLH=@W=FE94^@b}5lhIDQ
z50{O!UuF=2F(Eu2S?cSFt3QjUw=_*IH||uayWnnD&QIo3lqeWJY#N=2VZiipl#wjr
z#Q=&+Op%(bXxXcaX3@(T6@yXEKT=o|<<wczwEbcZ{nkp{k4AU$+;@M!)*7$vRgA;=
zL1FVQdk`j;y!~n5i<r7cW0N~wO3Y((0b+3f$PFvg9P|OnY0>$yw!oX!t6R#HQ(!M*
zeoyfzAhMf(hCGQfIw8-YMU2(?lD_5)Z<Kat=SRP6xo1g{d#o2V_XU({+E5D3F+4a}
z3|(O<wO!LL=FHs3fhs9U31;3M(t+o-ZYWIh`jfRF1-91+zDgY9rOA9qSeu;>t>O|P
z+1mDVBm&!#n@C}7TeWAYzM^!b$n#>9ps9A}8pjr3@XS7UH9q?~mTvr+gE!wf&q@l&
zImEeseeBblFfUbuq;ZD$g>x>j9@tKz2g3MF*Mwff;^nw#WbZCq^S#js_}k?=cd)uk
zNvf=<IZl!|e(Fc<MRid>oh|*NMaW8@%QZ`;%f+s@w<pi}5j@=CgJXq~k;akFPn`6!
z)_`p4CZ`<y+0Epcxp7vF<;myz+A=<*O;w@rN+VB`58gI3L{}&<qMfP*fZV0PoU&=X
z+@@kV^d+h0yzrLQNwZz|v_498y(+!Fq-pM%U7+H80!!1G_wh3to-rkl?DI1hvHsU`
zr(Jy!F}j{f2<w5WIN-LRu)li0)Q0&jg$E?4N23C{I7pi&%%LE%X_({ZYcc(IQZe3N
zj=VrF`mbgULuXeybK?{<cgZqLj~{9-Few!oay}BDAsf<Yg*JvCel{nW|NQoRV@&C&
z(VoX3M-@Xqc5f2DIu)lNY3(U;#Qtrnm<L;GsD8EBo2omL$yjlW%_m6BU8mRbkJ%z<
zJ*DSM#e2SW{OQ-rY>gE=Vzwg-)r+Rqdjm<Of1;?AJLR3(`b4CouS`;b=j!XHpdWio
z?M{{H@J{`(r_;%f=M8@>!}Y2b_Xr{iPuB5{F05)gSSTlc_f}g6O|9ikd>*FgYwDTF
z99iUE%J|z#W6*9|5i7|nSnI4&+O;L;Ah)14+l}~Ome`J{R}zR934lKxLNxdeKCewr
zb}<@BBv*=`C49QY>m8PZmZ2w7h;3_{KUpk>6nmo*cKy%u^b7A>g9=@6Z*FJ1lRUb;
zsNW{m{w-P-7O?3QwI(RoWSsj_{5d&&+nT@$X1uk;4&tKwywff^x)N~N%oxcZ6*6+i
zW?vl{_ud>~=9W_|3Zupy;Xl~C?ReJwMC;>ok=EIoh1W2XoMVp`nlert-!tObThl-E
zOLumZrKSds`|$?32rTA!Jx6wgiIesF@hR#W%+~1*H3iF1+c2T1jP#fIsqktBA^mj{
zHU^XGFh1F-lwF22O02UBtJ!4IAlkcEloP+6Y<MN{*iCp*<xG>1FAVVdCTaZ9XDNDt
z?1^GhJjZ)PLM3|Eli315;orgVRnAHyB)|aN;p=3XqD5TN&{dVsCtveM_J7Z&B|#1s
zdg5k6Gw_r0)}5q=mu|cfXL?MH5%lS!6gWy*H^w7N00~Ofo*#Il(1)Y+Aw+sum+2O~
z3LscKSYO{K3MaKHp128<uX$$lVa{Y6d%p(&#tY!Uh}=kP6M!+UG!y%IDsmK4yA%}1
zRjcZ<viE58V45_P_mMFHoCr}<_Pr|a0HAB5y-{%OSjc~t)W19rnuYTYQxJP^L_JNf
zl>FSw_YrJ0qcyPO`{D}K(?oaKMk+<JS^8JxkGAoo3v9YAYsrU5Lnfp!(8Tfg40(A(
zh8)uuS9Z5{^3YsN<4d=!%0{M_#;n}=2mD^dV%s0lJ=r09gxQz4JzPoy#Gm8D3p=^E
zxB^f_-V_2c0;-T&=C0Qdg2aQ()Eb}keP!is%F3BBR8G%ule5rpFI%!iTwZHdJ9vz*
zol2`uK4I8;s_?JY4Lg;c_TiVVm6VheWu|w8kMWh}s1<9ZKTJFl4ad6UVom$*PxuN0
z(GUk;Iea07!BE`@t#yl^rFdHX?TkCEdTuwY!m+N8l5Z$>e5675%ib!w^Lgd=C|hPT
z{;P=7^pq4MP2M-FO5YaD-CbPIy_!V>orBG4yn@v2>7*IuplgSsVPiT}l}8T?KS^LU
zjBQhV+V-ugBAvo4B43{=VZa^u3Kgtu(fvb(HDA^8kF8KYXkHraV-$CMI)elM;u1jI
z9|p<EzP^V97aNcOs7veOp`%_(j0fTixzf1VZPx%=rpOrEcbMkkzK14<5CCkIYxG_e
z{cTetXGZ@*d-uE6J|HFl45-E{e%P|NLSS(ToJ*l(PrXS^AX~&^Q`1)s7avl5=%Rls
zKKrc`oa+zY-tZ7m)0_PA$FN4IP1yZtw}+6B6(IOAHk11fAWhw0$Otzft<vQcgaRG`
zGjS@V&fM`C$pzD-o70(bXuZCuLHRghyQ>f9lmSQ})a&ZkIJ?u73vaLJ4}nxR!dyWA
z(~xlutPebY<7SUKgF3D>bIEV^o37|DDhistJ<849?N}E#aHU7lEDY6NEo0oJshhMh
z?**G)W-JCtTC%5rXVqMy^ELtPRXn`tvy-#6suo{#{=$yzmEqD%d^rtwrR98MU3+dT
zmrL+k|B@~$ZQ&CJ{x2BvOf1TJI@1&LWq&pYN$0-+y~sp4!KV4Sm+LX$k_7mX{e7nN
z9bfD<|Az~JSZYO;vvBv8vGA5*M9{`aA7=fPzDVFq9EP8roSdDX61bh8#O_A0^txi^
z&Kgf_o>V>&5S^NxnwXeZ3Opr{?fXRJML(DAW9gCb1vDQ$`BC{~_FFT=A-|}JEU#R0
z{N>iJTjQ4sCvk>Y3jSCwt}k$^zC;am9^N_^x6?=Xc+o`+E||I6xs+5CP!~7gYlmxU
z5k=iw9aj4>E?amKuh23X%FU>?TSf1vtMTu6jV6+Yt*aNqvkD6fY=I)`=Oh=|M!_a8
zS+~neEHD_%_@`~;{PZ3w`o^DCjRjhNlNm;;Pak@ij#K2+=qgam<g&njSS#-LJ3WgP
zf{&D!H+}go_<M|ZY{Y3r-Iex8Y>~2Hm!Z`}&*XTkip;p{o0i}9F0!$=Ax_-YwhI80
zgixZ76Rb&)zsmM)3E07B>w7!fQgUJ55zLGLxOVm~(~QY0sJQ`*4nlpfn+WmIYHUW6
zWU`kP_v9_;-$m&Otp5VU9zX**EC8Vl#$qk|h%Ed=I7cGjM-3w4Er#npJg0ul4ch|<
zC>GAU3Sm1=i2rPZkcQ~5&bmXD6Eg2JG+5|gjb&8U=(_;D8^+hm)c-+nLtrMgGbhsV
z9j3fGHg0+ITW51r>{UJH+&blXUGSsJvD;<EIzY|?x9C9wHZ=kZ`Tz$RXO$T%9(@~G
z+G(+oQz;Z_9=>GxMynWnd@XlDzirkyoV7X%YhcZ2$??vYqC|xV=j+_u%l9?C%nRdt
zr#iOyK?B8H<VOJ$_B8@j?9q!`y#u3^3Ny<qgrsjw(DTnyE-p6gzYP6m(pBxidHlqO
zSYfPqnzmeM5j2(3v8LNe`{wl9f>c$#30S=fw|dZ*XGh7b>S<kO?u{k9SWt{SWqWR^
z#%<>%UAnKJ*rI|~b&61C0^PQ}!iC^Un=76-qG1e|??lw%XyVMPXIP#F9*)6cBX?;U
zNyYUvmEFwE&2<Yv9fVRt!SWO{-7Jo}s^_1Jw}@!<2(#yjdYWXjcKdV9sAbJCY{3P;
z@Wm1?p&vL3EUX6Oc}Wv4>E9bY${E^DiEn3fw#^zG(i*eUZ<<>T?LVyeER!9bU!sF;
z><Rx9NT-#sw1~YyM~qEEIyDtVM_1?T1J~dNJgeb!4r@z0Dd&^Ega#xQpaeXPi;Vl_
zL?~&Yv<%2F14_GKb5haLlM$u{O7Ac3!<DzigLYJgdJ~N(x3gbs)_eO4K0Z@`TR^QR
z@*)@@>aVjM=*$J|rbb|5-=aaL=#WAm26qffVXX!Kq}y>cNC7Cm-6^Mc${hukWU5>`
zj1CI>^2XH?LPN%!6RaVau3X|3jH$g0528-{!biGAUPXoEXjQuvKMsvfUBuHH3n6ut
zbBJ&;fx5e%Q^=bOn@7I;&=}t%_tK&wFNVYD`*K^bGh_V|(VsF!$;fLzuq#?S8+s6#
zl|QV=3AOw3p7-6WRSvgz1K~<Yrqr&hD9R~ZM%8L^Kf1T>=O<6)&CA66X_tKno^Ok*
z-9w`KkoK)J2ah*Py13rmJR%~toT)D|77MZ2*rh_bo<pUIJty&x!p$2lh`>FFr3slq
z{X=&@n4HAtv}yRE_jz50-KX=O!0Y47z-wqc?oy7zxa)=HGv@GbUEw=$0q^s0Em1mZ
zK(Y9Cs6U9{7||;x1FS$Ji2v3;^^AeT`ee$n$_+eje<_e0YQV})Qnj$}a4?f-@`Rx$
zG$h;vN;w#tasmovX(i58y7d)5TMkcV^SuP@I34PR*V}uVn4o=lz|FwDkE7>5C>6ic
z^^0=d>p5LBzH%Ru@@tkhqIEYk*DpP~CKpepyY?N1>!Jihokg)|ZmCm*X!y=viR9(w
zxniz;^BLTCG2t8`b8ft*KlwS^|3*mT7h^}6A*Us0YH+a3+CFK#JmPTEw`FYo%p-_O
z22%+Ti#&W=D;z&rXpAJQ+PXL9KFp9`=a<*kEaeg!WRCFF8D~9q_9giO$C@AEzt-o7
zy!6Vo2JCs{?dFP&%I^osf0u4B0I5mk_98DCo*&tg9r~cC)RrF*lggVR%E3xw9i!lg
zmK#+<!jRecI(vD6Iu~}>SGWnwWbb}%_cN3E8IbU;3Hl;fIoQe=9QBzReHxls!r)&G
zGK-2+if-cD!IP<<|9lL^?Zf+*JL+@k_{mi64uxnTzK%dsk-Pa4AArO@`WT0Pzd+d8
zchItNtD1uTCD$mp!umm^?YqD{J<vRh%J6L)OR;WfMc#qFilhb_9l`RW(~Uy{t253^
zKVFU!KPZG_hDtGH0_esNGO89$toeOjAd3oKL!wtG6w57Tm<yDN8CJ|CT1S554V1wK
zS<M-NtjxNJV9?msnJPDBop#C~+jyswkz#7pk7r}))qhoKwYoIPjF20r7~~s{;<ui@
z;;U9xfM2(l^l$IgH_$Hdfzcefne60l*J`7=@8J2PGFDFjX@WN5kPPwz#%fHXkJ|J!
zSG^A;Gemqz(-wCKz%J1`Q^jXJl!8LU%I|aSGxoA&z@~D>lpY6)E2Nt#;(3)iFSv_e
zr=2V2u}S_k0xAqUU*DsQixkw&1W63oK8UorCsh`DqZt0~4Fi^4T@E|%XA31-nqrMl
zI*`@%B>W^SjiwD;w}N7X=89F&tg&-FAZcc|38xU)B0EuF8!u-Oq4=*sf7h?K+V^gd
zF`LaZOSHkx_PJ-qH#6fjq?<o$q@`%(+hUSsD?iaAYC)&wpL%Z$DtkbEwA{sVeRPaS
z$bEyFaOf0qxdkYJ+AX2-1YK(@DTbR8V{q_-!S1X#pwl^C2{&X6tq8hTn7Z5DW;Xkp
zN~aH7in^X%gFOro50#fECEf=NR5iC$ZGd|Nz;0c1KXkBPrfbbehVygCL}e{B2>q}2
z-a4wvZ~GgiOE73@ln~f7(gFr4E!{|WcZ-OMgdpAB-3`)grMnR}9kS_u*Vgkr=ltG#
z$9u>9@BVSea15XEu%8uk%{6Py`C05hrgr^Op6j-Ct2=P6iyXo6j&?Su<Bgl7e!^M`
zj5+`HtmtoQV$DgyqlNoOF{sez$ci*cY>0H@{7Q3zlI8v%+!;cuhm*VxT}ZX(8Vl*4
z3?SoY(q~U%2ySIvKYmT^+k9V@{TieyOgSK;V2n_QU2m0azK|0&Y#gt`tZWH+Yidf|
z<Q+Y8yvaR$&%WJrDt#opXO*l&3u5>&*mn&)7=(ig<1zkefEy%OQcNGJWJ`Lbu*K&b
zPIq>C6^^k8#<tXc?%4c#H`Ke_tkfN6(R;Ix!q32CO*o`53jd#vMWLUr8*mst?06)M
zItw>@rHnvgE8YC`ZrW3(Z%iXgae+wZzCzA^=Apj)@-@TH?MQ4iLosAUgLpzIBR>xn
z9B^LzDjUc59g1eopTjK2JhFEvEQl;-C`8EEte6R8bx5N&Of2yB+I2G{K?bKJQL$0c
zBbAUe?m{rp#RA6X^;5oN2XcTk7ZxN4rqEp+e~|Fu4zexAi;>E_T|`@V*eqV`fs~S=
zZr|~&4<Km-nvPUPp@I0JB73kK5~0RC4iZ%T^iVqtPT5(HLFDgwPUgK40kc;AS0^Av
zq(cZCY1@M6TH)on$(wq7c*Q#$v1V`du3+szl-KG8)!^3B6OgJ`-JUa|y$mv6^pL@0
zCI7DN?MUl+9VZfWr5_~)gmRS(j?0tYz7q@?HFp8|T0cLig$zZ=D;hB23|aRA^N%-n
z;8#}Ml5)mpFV-^Q637xZP7yJ>8a32~IQQqSHtWR;!RG--%myy!&ctS(E5`MZO?GOv
zCzn|r79@<!-`B!yLl;2C>C|jrGyy7?hl=~bAh>Ipy6dA)8y-%Ucz|lRW8<t{RsBe$
z(?>KyjcgoJo02NNn$m7h5Q4}ruw!E?%`dQ`dibU*XxmZv7aGLP<BB&|s9i1qhTR$P
z=Cu+?-qA60@Jv^Mw!1e!E%xtG$ZfW4*#h+#XRrIMLW#Znfn}msiC3-KsrD`QEya2x
zFEVt!@%<Sp^b_HdR4Gc~7no&WsA2t)3{rUjO5tndgyk^|ZO_rBEe+cm0T#=|j8~*;
zPZ}>jXbF0P%-wO`8coWt%oU?6Hn#h0#Ra7$cEcNT5E5I<{gI7`K-2n2I9a_TilnOE
z{F1$wbd|_wi|C^mb-r(^e7K=uhPP-C5)y_giPr#^8}_;3gW9K#-S$c1SXGv91hA)}
zuyMQ&9Qu=QER4+C39nhUw-bftQeb2YQe5a^MJiyf!Y?Beqf1eYEs2;SPVO0wmkYO{
z6>cS9evr5hsNY*R41z0*7mtTEEKBr^;}zq_+~G!`&xR-$15bW@kGZ(Wt*-fHrjbuz
zo|&#)1-R^~Mhawo@4(Oc3&=Z5kqK}&<<t^j!XjJB;;a}pTnKWTs2=kJqqU@><`Xe9
zEx@LJcFiS73O_?pb@}NTHBM~+sB-?~1NMg<wtEEc51OnR#^tUqvRCCB<)k+E=<F*l
zW6h)-r5oCjaf&{oSlr%MqXc_==XiA9=Vf;E@%78cz6o76JG;p~XP?}gR4i1V{i^NI
z)tr>2dgFE*AuD2AA-I(=1BwRNDIdPEfEJLO$Gj<UiA;Hp&(%LXG-6l<vfGkyp_1Jy
zph3D11r4TA<L!>ifXy0NlqF;Pdb}`=L6}f)l%FdH2hwqb7KvnU+>ByTiyiU?SL3d@
zk|GT{`pC_EiA0Q%_DW{lKuUYgNr{wSz2a<lR;!L?kE3De!rah|G;Wmht6@gU9L+sI
zW6?IX$=bT+8#U?D1>@~Ka<VrT(o#Ba_IDmyn{S$%N9z=b%^ylvH0cXIqEw2=BbyOC
z7&--GDoyRaS+UghQWu2=JrgNtw9At<cM0|scz{R2#1!G07%?(0ZD8Pm<C>I*5%#oD
zY^HT~9O~Hlf?ZWd?I+sHUweoaAhlDv(HSYnB`xU>(8-P#<`$mij7;#>l<R1ukKy8m
zs*UqC3akHcE4JGsrBpX+nRr^wI0!cks;E?#Ss3h}VF($5>+7h=VKm$e4LiQuD@7U}
zTVQKGP6p}31le_PoUC~UIzKB1sXrNL;P!|rzuLku*xlNmw{GXC5b?^9ujL<N!txw;
z-opTCRTsI~RO*WFx@oFD!>ExJ(VLy(<^Pfs*eHCH@L^)A-8s5iuFeMJM8<xbq#JaM
z?NQg#3_mYc*|sFpatKJPD{5%8!*Xj=AAh?2h4`~6=e$LX)z>3g^U$|YEL%kyY4j0W
zg2*O26%GVlgk^-zfK-&abZZ1Vv_cTH0X90j`b4$M$+D7#!#3jR)3YhmM%UfGaey@c
zEWu_VW5n`{Wy~G*@!F48BWKwOLu4%53E*btW>hZeK?68KH4m8QTbMWNXim;8>>j=9
z?goQl8E=4ZeSmLa#CwCt=nc~7&?Bb8g3>&j@AK<czkNPH*mVP{X5V+wWB(EU%8E3*
zi@OCz=f7}&I_d}E5L8m0Kz-OhQWM^10SikLChWmiCX6nW(o}SxOB*qPI}rU}h2PO$
zf*O@^zE?Tv?3^#^C<vvb5v_jPlSe^?2Efw9aB#A&8_{D3jH>v9XIENvJ`v|Rco~+*
z5-zNE<>RS$F@whMdxaQ#$9#pECRUn>O-xOws;TO$sV+`Uj0@3p7%4|eKYWjj&ZUT0
zN05o=-5)`xNaOK`tgY4js}+dMRg7Smc=mXg2`FFKZAPtFK-Tnel626NQ?fW-Dogr6
zA7fx~I-U>sQQM$?iYkDdP{RemCJ_06qe_rzNv1^yJ}-{cV<*x~LS`J;&i!+P0sz4m
zj4?97@bhOawdg$5y9kyLY{2~WS$T4-MA7nB3W`EGif;ldb1}o?fUaD%&gPGpj8WmI
zN15@R{OraqwhkB77_Z~Y<m0EDMBEU4pL;6@iasf0CX{NF)6gjy9dz9Mn2}dSHhA$d
zF(-YvJY!@lU5SU4u6=l#1vE>{d^%24Ef<;6j?O1)@Gc!WevVo!M5crg+U(nqZev?q
zi1tt?=Z#GD-EB1$EnO|u#YdaNiNy7ky*L#*nS6i?B#F+5+Z_X^dDCAWegKs|M=j5*
z(%Q(>X@oRwUFt$)3V-`zV^i6`@p4nLAkzH6WZOHSXCZm&-}Aa&SR}J+VQ55Y@rzov
z6kLM|9IIA`viyX9o?&>-*V=BqO-{iEX5OeopHFqS<{iW?`lvhR1oRfEZy@j$g(3~<
zNrVJ?MHEaINJ`aXCAsbL0*|WzD3Qyv3==}54(Ah@E-yN;Pv~J!`8C9CqO`1NVtHy@
z!oH+e({`O`{cr?c+6L7Vrc$Ot+Xxm$GA+Tkvr|G|*>#-6k}Q-DAJa!R<hX_oj4^Mr
zGL;sW*eS_vIDou2%k7;;9nD}j^(ujRBW4>guM*-F6Ub$-gfZM9TT2~KX>1`hruhgH
z0B*6Vj>Q1oc9%tJ{7v!GF+Mk)f;?8q8*^z0I5fwL>8>f^jIL?QDbH%k$t+xy9Da6P
z0VV*;Xr}o@2qBU1dijaB%AmeX#^?!i@rgSV#Sd#I-YO}Xeye^fvS5iGDHQV<@`%2J
zP|L&_;TuFc9kr_FEFHD_$~-fsai+;QUM#bvPy?oSCHd^+)9S>cFu=Le9&+zKJan*J
zY49P`)lnmu#eFk5CX%68?R>O^nLhDq0U69fb@4?8m)6dzDs5@N*}T~6plw;zV^%;)
zAUakoDXYo}j@yudRR+XzGQ`gOYbV<>p*jhW7e?eW(S|KAe>hTiRk<BU-R!GqE{ro4
zTbNf1%(I1;R;n}a7*hZ{Fb)RP8-Z44&+E{_Dxfz#qoQHof!>lHOzWZk?aov^A%;yZ
z5&9(`WD4D+f1T;BA(jsB+p@1~1~PSWY=N8R@a#!dh>8sXTM#G;GcPA+b*1i#vyrbb
z&z>+~Vk))R-va8~G?AcG>nV>Quy~#dzEQGp*=h3Eh{?Q8d;bELO&Y8QB^hN$F{SyT
zjAqu{5I4t{$r|UnXYS8le+TSQY-71iP1b&|P(g5R=#fKRGZp*zqs<2-o<QYZW<N$m
zmhSvGH6}Ttrd03o;RA197j8ZEtn49Cl-me_TK=9f%&(Y(9T3H;hJVQy){`--N|UQd
zZ}q-mcW7TnM-~gHE)sLCXeug5DWg3Yr6hep)QDzi?{}X4c_tD*19odz$qiD+@^2=D
zB@+TI=)3qz_%r6)WR@2DKK^colnfD&Dbj;93zYH3bTKesj2OQW51jU`&j{1u5-jN!
zl8u0tXQmeu)PA<YBUZ^i5ic0?3G7fkc1o0*;H?(lWh1K914sBkrTa2HIvCo}(B+<S
zz^5i_-c%O1@yAx?(biEbej3-u<RB#J<W+0zN<-U)1~iJ)!xtN>`Q5CXE^@o?w|}`S
zy8!lDkt@w)d(}l_NJD-S>wM1{hukEmAC4TaGpGNUQVCQEWPM0Q?Ak1p={`_tjB$YR
z3cG59G9sG26AlIt+p~Y<4s22&EZ%sLr2L4%hf><!`=0JslAxix7|9Hk45~3nqKx;Z
zuxkT08A3hIld1Ouj|d9ql{VHli@5`hCIGDD{@q7Y^*Z~rR?{dq$kEVxX{4=gqU!|y
zNqgXxl&*LRe08wbSYY0y7bTw+?85H2DL)1$`FvF_zqG2ez>UMz%Lht8Uf+%F*>jf6
zf2FaoFs%*@0<sZ@-VvE9Q!!0d9}86<B@1J-jfR|)siTN7hRw3{W**B=DZdI8$s32_
z-#rTk=T>UiY%N)1-3t!cSOi*+%XEsk>kuxa2SRGGMp9Ypkq~C2i$CgCwsv<Q7g`{!
z&#Rg-ka$^<siC(x_cg6`Ztn$zQJVxrhHA=fp2ytlm5?SPa2t&zl#wLFW5uF98zTFG
z#~KMFrZ5Rx3@ppaAND<SWvjozG@PBo4J!yYd01Il&x&K#e&5ANo5Zl5lsmA)v1y;H
z#U&DA_AY&-z%vO3D9;I<*Yi`dsD`do3g{k|H;In~^5$<)f9LC>bzl+fL$|fi=%3=r
zFuNV<^`7TZf&jDeAQ048Z-1HSyoT-SUjM?+m9WDPkyXZ@{Ccuayq489{!sXNz<7q(
zjLr97Tq$b;7Ve)&0UQti(>g;n!3?SUkk7C>j8C<%fni0dzWuuzOC|akTSGowdPKGE
zzJHI6Zb@E<iO0S_F5lEI^I5^mPRRaN;_QR^;!1X6Ub$Ue_qLhdI2l<_t`i(dFu_in
zHM&b+EU&uOl|@R#gohP|Kp){9gG5exSTzrT$aP455Q<8JxycS>0+3>4&7jb=E(%5A
z=n-wX(I$cz6<ZkavbQVy_%?F$3*+I`)9(i~^hTvnpgrTCK#-5z#ymNZkH1I}gvdQX
zxrA82H^N@|7_TN32hD0y)A9_Cc10T_ZD|jiU&m2Ei$v}rs~jFG6E_x$kG#>GeJ_*(
zUV?!W5@ZFYrw<=wDD#AUs={757VEsfyYsNf%CZ&!5QMkd+4I}_2~b(?9Vp9kWbtW&
zU5~A^oiM|$K9*i?Cn6(|&dyxSHJkMP%)WVi`DC=HV;6XW?Yd25lv@EcLi2S|+DW)E
zcUH>nfQM80bMxfvm?2SOBO?ZQO;(A&f4Ikcvq>sJ<LETm8J$MH5PK=K7S$CcI2U?(
zn`0o8JzGBpS^~HUo4usI63NXYTNlKE4@0#_H{^U1=pjALs6cG<zwsGah9^iAx4)yt
zV<M$DsGl&%2bP~gQ=wg6F~PCW8LF(RE_XbMfA=oAQrVOn#$_6&l4kDkLvYVQXv$9F
zCO`!Pwm}3|ZjUy-v+~mGk2n16XY=-+O<gADDJeZ99C0?lEN+9J^I-ZRA}0tjN#R1u
zqVj3-b9+c1WU5?d;&Z!IN|npWSM*-t7O{}5+^T|>f|hJQ^H(F>adEI4q}g#saowix
z_O?Jkw%_xR0*0gz4NIYIUY?ER{tmg?lP(DL!#CJYBbLmTWY{Do@B?^4_pger&j}JE
z-C~$8Dd$na<${WD=peLwa3yE6K?AVq!TQR*WTE1uGI$664(tx1r=h47!^rX8OvjED
z-SQqTFM;>M<`IFWLAlpm!^Xo{++Zliz=(SAgkehPBjFP!NtXSKClWs#8AvVnHf;Ac
zOyAKaz_K2H#-ZH@Q#QuXMTxOGS_9Pm0Uyv~qI$WM83#rL+o2GRx#Q-1FJemS-KWfw
z{CR^K#Lx@2wR@9zV(x?&Rz<t~1PULldUSV0onBaN9Wl-~bNPhfx0o>mUx9!5M_E@x
zB*EqE8BcYz!No2FKZnUm@D1@MC*-+`1?b1s=<HEm6-#+8<*?)-Y}pNV{q|h!S`74<
zl0O?c^4r>tV-l7i>G$aUUlj=#Yhf+zhE26NCG2<avZzfiFt@?Gnzc1OR1vnQhgu6`
zM1FPVpD(ktiDOo>BAzL{RzFY4{XhInB8Hvq`9pV5JdR4;FGJ6?yU1GRolQkQM?xqS
z^2aTGO;qM>i!<?y@O|OLH{dbb2EeSgt^<CUrvCYJ6xM(I4e+T$%$*Vxbgox_d=c0k
zjg)tk`bnMks=OdVuurU!B3FvJYJ!LG63Y1T0$^<*3G|P@3u>j!>X(=mX<q)drCa=A
zOj8G4(zxm-QHV}s!(jvxLYi1l5~+|^(FuDg(9J(2a?EhkKKoO_6ak89KEf~x6L)N5
zv1>3!kZ_Ds|CE3=aMS9y!3lTV8H72cI{KN*K5Oh9j7lV0nZVvP0$71!|KTn8*`{po
z<3C}qth6@fw;<6F)QpaA0uL9}0)(p+kQ1Sa0$ng;%@W?Jw0+jg1@T@b8Jr>9T;3_=
z2IKZN=%hZ|4h5+SxwTeQ$L0a%)Y7|80Z2ggmZhN(b~H47n^=wGm_mWn8k`jp&}hbh
zSlEVFqhL7NUSv6r`&HGzyP8J1mGUA;&!+P^3PYo|QC0qt8b2Zy_zB1S3vbl?E*yQt
z95aSWwE|MxcpX0)S9WCm5;v{w5g~_`u4}E|0}_V1Rpv6`_W5N(P|jr>jM7&@VRLTn
z@M2NhRoJDs>k(mDeUnA$w@MvSjh2sQ$e{WlP3#*AQ`w(*8^Ogw2I2eE>0+c!@=?ts
z!l)ZIGXoFTR?!F{#igw$5hzfHcRRMN1HV+r)AsQis3zZ|SO&_OIpj23BzrHkIX*)F
zGuUCAtRHakMQk=SAg(n)K(IW){~j1lRso1vCL2Y*QKkMK_%{LxrnU(pi@>F9lL4Vi
z{RW~yQZQd6jh0APiUm>dfG9IV!hz`iEj-hZ2F<;$eS04jyaW-+k+6<SOz^NwiBJvM
zi121i;Gf{A_of=|TksN8vdPRixZi>YfhJVMVmN~)0K*y3gR&>~`OnXB!AsD!w9wss
zYVd#vcY=z)lD`4aqAXA*m#xkD5gEMnhp|}nEMKF72Sk}%TH6R{0>r05)fyPFGYbYM
zcnKn^^t=2`e&B&(0ZfHZqv3CPU0Ea;Xsm4)4F-4#%I2~tgHP`f)(Ijq$EgX>s@1De
zKoz}eE;W8@(Z6X@HV498sFuy*>AX@zWc+H?C>0L#dHrc4+RuFbFJycxp`Ta9CGUb!
zqYwh(#3U3LKKLgWAR1;GY^7qt5l3mw!Nbkn(XnW$hA3>s`+#fOlP3croFq0!%ioI-
zgO79-X3=NrRguAgP($Z^JzwRYUTlv4V3+cuKyUK(R5AIj=qzJLhBkl*@cyNqhDc1|
zv0C8tLrnCg>I)){xMa(~r;W>K^{=J>5dGjA6%pEi40=pB%&|HE?}M6X4GpFPp+O++
zk+%Scewx@T#FF43R=!LYlB;3+DnC#J&QJOSr!+qP;dGpQ!4T5GbAWyOm!LsuZCvFb
z=Aj(Kz*2@D|FIN_D(as8{U`qawFXqFDoc$KfZS<y8h;o5APVVp1~I|=l}lGmIAY|N
zGdyGtPI&*}6FuejPeEWkda`F`j?fUxqMx`=v8q)hMqKG_X}MM6)v|uzN%8_eNgwcn
z<3Vm!_xvAW1J=`+5ZdZ?`w{__6BQNpo>e6E7U0|XENAOKNPQy7Pf=1-@Ci;QT3BEO
z&I9`fjVVajh(#|t6($xNmTU0bu2ypgrGt%y2LU95_wGevN#j9c?Oe;|rdjGXHR-B{
zueDD7!2s(o{@b%|ZOO2d`<Ha`Dg1N}xz2rEzh1Zc96xnr9Xuo}ww2<AoM;Zs&rB?z
zxD(?NlY{wopR*2#AShRNChhNK!yI7&xxGORAA8D!Z=EFQvvzq;IMN^7|5)65*|AM`
zg-@BfclK+~t9E>BqVXA3aEGS$qCc&=mKp*GXUQWaXA>1YxV3q64^*BR1JSXX?c4|O
z$qH&p^pds}L>E1_jvkf|zT0C=xyaXgR&G!>h13SGw&3}-M)pTBs3`}?8l96gI(1s#
z9E^(2Rx`$yU0nrb*xVDoDh_oMIawVg3w1SCX#sdoiMI9EU?d<F?V=Rt=z~y=^XZZ8
z&EAB53!FS<y%Z41Hhg^qR)I<Fi>rR>bh>vrUg`%uyc$gL+Z1az<V?ktSi$3)FHp(v
zN|}<=7&!Fir$Aso&F-9_Yf>dPl{>(UM19vL?Cd<@jDwa{f+_2)#M_mF&2`IRtmON1
z6+(>cTakH{O4v^II*SX=Zu}GsH|y+##jP$KwY&gns(IG)^1{s2v}G!}608sT-c)34
zD>bGNxcq4T)b?_sTZr2CNV>WG_kjMb?|kTD6F@}TZ*e34#3+qfCd@WIkploy=y%U%
zyw_<a-=cGc!bWJPZOHf1EUG>?%9r{9K)3#W7Hnwt2yMuC|KXLW$l2L3-{P9b&5!y!
zZ%_0AtQnwPtSo~Q2Pqw7-Qy;<^vZPc1|?bVHu>jb_4KarpkIrVn@Z@(*e17v=%tR%
z7sHsrA3y5m_vQt>TD=zUMHa<|MDaa+>M}TS$*uzdEdx9=uy#xXy=FqL%VeTCjc1X5
z-oG8jG#%h!RP0w!zgxMTj>9YJMfZ>~=%D@+&kae1GjHG8p339PdjM<Aqq*g{*5=5)
z8KlDDbWM$o5e^=@E%;E6LA$8Y>q!1}o&>~ce79q@=iAHun~wptLX7)j?aLz-INu$=
z7w4=I|Ip<;=V+o&)}<pYiOt+99d7{S1@K()?1L$c0Q$qr@lf9jHTs@=N@-p}i_gm1
zET8QSfO?YhJD)6R-He&#Yd(YO7mnrawHP!D7vv3#a@2W1)hPS`0|3-GH!opE1)uYw
z@RYV@_}Mpp$#OFX(yRRyQQw@xM#&w5F8I2>_xa9w2!GqUklXEU^Szv$oV~j31)s}1
z<{xND*ur2@qoc|};9P;&@N_wH=gnWqdAY#Yt5kqvOaryr-b%gf@`DSv0;*`)@Nz=K
zNm4HQ6jHr*dIqh%mX>o<&OHbroKRghl5|2tfWBncay?MIxIkz76@Y-OboPyA4YgG~
z4;jDc%M}8w_6%gp>$A1I%2&3lTlrjWhk#a-4AbR2k;7dp*qYhCU5N>G%UC-Kb+gs<
z+s4p&(5j(Qaf~VIeKEc6@Zh;O_TZtCnKX*C=k?mYZKy82x(+*ny=-UgF<a&VGoP=`
z?*A6H<JDep-)2Yf`pr$?8N939+tX94gxCFAwxE>a{h-Xhg?_JCOmN4to!@#1yq{Ut
z`8yok3GCH+>faG?qE}d4*aH#uy2$`QAGmf7<c)8wufiJlu8&XD-PU*L`M)><BtE+a
zm@9>-NOYr{K1$3ej}O=$gzEOG?pwJgu;^drU47s(<JpTq>d%Atr$2tkwl8NQkmQ61
zL)hDqJI(s&0SkzpJ7b<pQ<OFz@LCw%p5I~DWn*Q`?YtY()vy~7Y-KrISN~JHhOH#j
z=exl;sf{cLTF>Vv5lSW~2vgwTizg8cCrd_~V;RM4wRlm_P4gx7cSVo&QomLJa7Gfs
zzAVUD!j>xJcVITy+Ie_2Ha14BcS~UE;UYJG&+n*nb(Jo~i#Qers#wFHojV_7Lv6#&
zMs-iWu&4?T0vNnPJ9B&>eiTLFQ{i5h7lG|G9Tm4{ODkxIjVkJQtdCeiwM5qYSV;hi
za5ix-=ZOqOF26_An>UNi`<OwwElEiUI6*Jm*QR>&C}J@L8yh$`yJ9<2=)c-|Eg9D9
zJom-#K}dvBKfl!Vb0CkM*4aglG8ttVxc5vr5?OiVRtgyy8B_fZV&jhPiNM`8#Z9b<
zQac|`K1fN*(6m*_uC4cGc~SAy{f!{$3rsKZWef2gi5+ly`B$$?2|>$)vZsfXH5TSa
zy+&DUje5U&3doHPww^(YMbDNMd@tsLLb^aX7rFOMSFGKvy`=nth=VxA3TEat*v?=N
zJ-pPZ)3}-nPX(pO7hQfW_~TRqhgYj(D)I|0fLq#tboPYqCQ|tx_5FGsyKZH)?y<0Y
z&;-EbfcSL;U5Xx7l1NCvl%_&tQUDm+juU$c5Wc(Qe7Bo}w*Z#0oPe>l^1N|z`f5Jd
z_vn?I0{h_Muh(w}f1R0hJQj8T{WZ^PIaMb50`s1T<7M|xkhTfL()r~*1We$z-Td!s
zOsdVz-9t|wu!KjT*+m3=VAJJ77N`m+YJvl<op+vmh5!%YZc7^ua;<fHm7h4It-F9E
zUWv?r)l<PNR!Jd}@tLdKzPVqd4IW*!qJD1Ju?vnfjw+I-ND^Sg?eOz5<eP_i{#_3=
z^*)#ibiJ1@7{X=daql2HbE_wcD8wbe2!Tq{mJFUa6cu?O|I3i135AdZg0LgVWyV_K
zVURB2&Ee$z$V`v0!oHhF7vj0bhMNk&H;ZWo0ubh?`tSCRV(wRnZcShEK+)r9#4Fq2
z--@`=Fwm7%#RBb1+}y9?{hj`tngI)Oq{~s6Ta3o+oycd5m|V*QTHz-V1t?^X6J*4O
zAr{v9`7$Vfbd$_8mpZ$(ClPuvD>C(>ZSDcEd@U2KdpKDo{*%-Zg?R2`wE4-UAOCyY
zJ$O5Wq0Ar{j(!XuW98OC#>p!3S4lrm!tDanVX=@_&q6}yLr4FJvS}=kaNduNxASH8
z4n$%4Ne4Q@hlyb3(fi!3>NGMgF7mr@&EHet8yx}>0+7V)9j)-z&)36jrPN!nkTR$}
zHe%EtVK@GwZA*GiUk(dD*gHP{W9q|6+saLWO_*ybh?tua4zQbRV_*E-;juLBDg3VM
z6NvF|<4-tqhl>saQ76^uNDa$2zx^C$iit1*TQ5xb>Wp$gYK6kM^9y%HnoPv#*h)nU
zpjTV75C0*iKkT7NEG=i6SOGBD)kpmGu3r$$Js|v82pk>NQS9aC0bl+}kO<%?`%iEG
z$_+sbQUEbXP4IbpSRnAl&RTJP=ueki^}E7OMaH^p-Ybj{3|a=A!{SZWXrEdkyqG^p
zEm-@d4n{GHSGk|)$|?Cs%e?UvQ7hs;{kxqy^`8bYx2>fGzg`GRD}aQS1P`fEGK4?n
zih%EkK)Ge8D!-j|HHn-GBgfTf!N^0_{P7Bid4nDhD@9`Zy<b}BFy3VHJ`B2PEDTCg
z%-3k;{?ok}j<v5FO6650z8M#20OS9rfqvx2rssYT!!z<SVS=B+Sr&M}x%!)YB840p
zeV*DXK04A;p<BOE0J6pYx8c7E2o@O^FIcl<bu=aWzkL4yo%!<Go#!&6aIFpgR1B~8
zEdy{K|F-Rf<Go(0rSr7aLrK-?zveAs*7Yk4imsq&AjcH(X0sD=3rPGjb>q!Xq5tX~
zIHE{T?1NBFn@bt_{dVJ6>61{Q+`~U!CQN{w88}OkhK%VQ6c8{dXqz;h3_YiwRlQ3c
z>&**-A^v0$U9!N__?vTMj;>;Qk}!R4al3uWR5S!2314yFv(Q2X3zS)`0av|`g^goe
zj*_eWEGzrEFZlzV6AnZlb<7`Pn(^0LM8fG*Ut(w6MO$BVcl!R4b#6*tW6$}L;Y-bf
zXuX2C-d9je3l#JUkmcmz%@SkGiuNu5gfZGcp*Gq&U?5?Yed#3qACASt4C>pvY>T&x
z3-OXqU3R^$ZagMEx33RwstUI+Jd^L=S4_&IY$`@Au7XWy+3YT<<{K%!Qc`@VLWow#
zAI`sIosWg{kw3$x_~nD=#yH%xPL5ohbOQSfvyIQ#4Zb)^Nx@x`yJiH;QY%kyC=M@^
z76pVpwG_`Aa~-?>@E$sIS9KjB**x@#V!UKfxOwE`>nsoN$f;PcCF#KoXnBdm77Trk
zgl7F<N}mR{b+uh3K`uDCHH$b255mWMl4LS{_od|9_5r??z=e<GM%7oWB2Vh=6@t?<
z*N0;a<A$H_W205`0!VevYfSfN#d%dqx$M>rI*O$C`D^$*@hR{<SEmjZ__@L??jhIB
zsS#E_s4T-q6Ao_ssr>U6Qj#G--H@>+u0N$WNoQ&PV3xqsQ?JK>=lxEqO}2euY$`>Z
z5eJ!dmCM$T`Ww`yqL|kn=X1Ka7#%sQgdYs+Sc?1iR^dBIubH%KH1U3CvoFJAA8n1}
z*bC^~MWOlah$6>z{m~qG{9K<z<?iNN=XjaxtcC#7Rr=$LX)mpdAobHbr76w1*;#sn
zc97Tg4TJr2^DT3ZyZ)iB(K6$spJf_!`k~YuQ@Qo^UEOT+wvX6K^NO}ytrj&c6!fU5
z$|>0Pg>rukq_n}<!{4Sv`5I=?7$mlJ;tjSeN6Yy1Iy!~<R=>)$+Rn4W|CA+z?<uo&
zeEmH&7w)I4IL+SlSu#1cN_h8V!V_H1`;b~?!m{*jZ0VPC2a7D#qpJRTg=B%#8_XyS
zyyHX;4;j}1dgb`BzJYokPJZK$3T<Yoe&>0soYT&y;g<k|H9OX>?f2K`m1|dNqni4o
zSlZeoY<gSd9R0KG_VQCG6_eDbd%^O_BIbFE8)>IC)^gmWDf7Bmcd?(PMqmFtbppM3
ztzToG$Cl19H^`CUB?~qeG$ia$fd%B`+rOL^#XZ0#6{S5}|8Z#C91R^zfqLHtN(VMy
zi>988uP7hl2LApCJ7Vja^9bqH(=9BA6si;uUz0Aw$T{8~)xF%56urKBJ!iyNmjoyI
z%W`@FVp=7}t6N$Z1Q3<2+v>DO`+NZ{#mQ(DU~M)SwBnlAqYvjvg1TUupLDC`CaoJC
z@>DdW^(EHnS~vLTj*A}+i<KJEhrh08(TssLH2vN>PH*X`WLxFo;k`-n@C7ug-sZDN
z?xXg*SXxe%ll!^NYYivoOqh$ICOX;+co_`C5m$9NX4fPsy1OtvEno=$X4jH>{d@3N
zWKZ^_U0gT={-@^P&@+qKlZJ~6eZR}=<Woj)Qv$6JM(}$Kw6Ny5#L2O-q%z;k)3OKD
zLGKqDZ_{c@ZVK&8eT1yoir1+58Ma@@8ySrr5aiy7Oa&iBZNmZoK9&e?JlwW>TTW*J
zty#`oP%9jVKz+d%eC&Qo)gOEFu%mK?*;om%EMp<soI_TmI<_v3l;`+tenC%6g-qJj
z?*FDhslTT}O+P-oD8yLftD*MNt_*N;4o+^6@A6<pcS=v)P@tj8xLS~|c#lTRyze20
z?k#NDtgmF(m4MGC<>uH;ciUc9#HRZP9(k(%!_Mx~RL(=MEyn`Oz;+Jijd^hQe1a^4
zU29v1BzFT<Oi{f@p?TUQN4#uzahZwL%r^zmTr0;zwtL5vC6kE+7fTUCFcza;hJ2M7
zKYtP%9`;s))hV~$9Pqw3&)oBU{UPyBeZqkX$Rtc(HnrYs&{w&~faB~eoQ?p!NN2(3
zX{g%eX}gGe={!wi;rLej!i~KE`ra9e&fVBRB&;R|<N!tTnpy{E*`JB83vav(qaW+;
zMe%$s@VuFhQxf%&(X(y!h%=d<7k}uyP3~ayA=mqAWU8OzORYXlG(oM%B&<WS*@yP>
z(fq|ByJm+ZhEgmEiD*V*%Z;nr$eK!wNvTdVSgf^CKl#Q8pRWtdx#M@UsVQ%ljV~ez
ziP`kXngwG@tZi*KMcYDzzudcvJ>PgsGd(sjG2!R0evk>$EH_R{U-|?`IoeBYw4erW
zJQ#7^xqBD8+EJjskorD<48#r(E-(7F1-HV{qjo$+P344oi5~x1>N>yg3l1Im0HGG7
z*j7`*!z5hX$LPpH*us!fx2b0*ZDUMh6D}1J=`uX7R_3*j{2trN8f<-w?Vix}I`QGR
zz-caRD;{*Iw`<G0U5EcT%nV%Eo`C85aI-L(+z|*<%N0s+c(gI^C!2VHCtbREa4(1Y
zp(t%!3tbRc5C`3bv9UQn|JbyeuvM5!d?(CAp}$X0jcu{{^gOF|EOq2Xu6(UyefVwf
z(c>Pi_b(dEt7b~r_G`4xM?V8mOoz|3d!7|8l1`b)qGV5sTt~r%iF|)4SWLH4V<)fP
zZjmxPjm?kIra+NidGqI>XEB$fve^Xo_NKMkgPP`MAHX2;$pU88DNU;l-ZNj~w+W)m
ztEbM6xmzzuT+XMR;*H<rw%_MBo$DJlvyfBnnPvao{Zo4`I!L|r-Vqst`ja}A#swql
zUj#$b)8{g5;UB+b^du3~81z*Tsj=OzlW@PC=pGfU`hC4ay`hQ)t(aFEHRl2A(4y*`
zxHG=6u(wNs75nBx7<(J+>J_ID0f`DKG9Q59mDPGCXJg~TP~5YK9;;Iq+;A0I$N6e~
z{xjnL{K=GOM7~PzpE+#(_&B^WGc(gr+~(r(d|7qh@UbJ?S`AiEtFbQWF9Q3*jk_-A
z@B7~D&HEXoMAsNTj=wbJA-k7NU7Y`b1J`L~t@!!N>!KE?oaW~1W<S}B$vKbk*HFM3
zYdZZVnl<82jt;2npV~;id2=`~y4Nb?`zZNK_cdwlZqr<kpYcQxg`w3pAQECiZnpu^
z(4W+8<i8SqS2c8fd_2xaUj~Oq_IVZ>5xfvdWVW`Uw>S+c2$F;V-evVt)>|AZ6XZo6
zgf?Ceu9QXpRhDh!0S%@Ofr`KU&&Ko2&lg?x9bIlR;Qq(^d`v;S4aPiVJ%0QGHa&YJ
z<I4@eDmc`frk1^MPn#?|pq9@`Jyy;qZGDhQeWPi89M6+E%+F{Y;I|Y9Cdq&iY^>Y6
zIV0oqvZF{q)cqd$@-B&#=!4U|)q|>s+?ZaMm|+iVIhlg!V5Q$;_ov<LCz7Z!ye`8j
z`EC%w7~Q=&KCT#PY9Fp4tEt~gc7w^aALop~?R_}Of<9%nw;8{&Dkjg1c1p%cKYaT5
z1%8j>rASnbdmq;-Ojn_#W1<b#B+kh<P|Lw{zbp#r{O%-1dYtKcGYQm{h|=q8YyI}=
zg1~Txh(>)=u%y$Ey>{AkNp4R;`yOYa;@>a+WgWRc{=HMHn(#^iq{B8}%`#&|9aN$^
z)y?c_-F-%C%JI;Pl!XY#<Q7?Pe7xi+&~8rq;2DL8s~a|oy?~w4kAY6~aj~{g?4d<r
zqNqjT;vyDb<xwvI&){?;qxKTpR-`?<gXhxfuBTVEs#;p_*DEV9@s3BgoY@|XN;57y
zk_c)xITQWWneLCpbO+3xaU>qA=6!~uH@#;j$E0ob5ZwKZlA86Yr7tR%Qm<{&E`<yi
zW6@s;Ilw5KQq&?7)yJMdX8}_|rqmUiZJqUVy+cJ%dVqg2TGB7L9^O}eeylpQe>QL^
zlFlB!19h^q=>Hsi8*Gme`%5o9RvXM-0LvfA4sFRX_T%M7*5w0#s7J_K+azm4a~cCi
zVB}rX1w5$#|8Rrr4Oee{mp@gmdkM+fO_%TAq|!s6t~MpVJ%<N1W38QOAl}hh#pO!K
ziDFB~|CukvCj3{A2`9GnVNCa{9=rZV$7b#N7Cnus=2*Pgkj{}0ImG0R9WdS{gEtD`
zE5LRCNR=gy*bw@P^9O}cmI5F%5`&7z-!-%>!DGAMRzjc7#HzhSzwfRJ=#8*i+sKiF
z>zfL4cr9%*RGS*(^y5+-Jhul>McKkb<z$J*o<`t&S5vZ_S)k5*YlI(3tzjE(w@|ab
z5coF6L`X=81QNp3r1#2FqPxh7xMrlIU_4>hYxJ?v1Z${#?*J>!w#_PRu5qs`9Qu~v
zjh_xxqXl#zhnRMn58^#OPF@RsDA`>EdX-Wm)??Rt*ytdwa@CM!^Y+u1nlKcsDFvke
z!9Wyi@UXD3jWanjuF7gH%1}3w!#gArB_-8QnwDmY<+NQZAXYqNV3>CE^;TT*;Lf6$
zS6#JO;?wo$jI0V1uTxbzZKdXGE@_|@kIIcLoKc>UP1j~s%^7g?u6_>sTKLI)*P&ht
zqT9e#Qd81y%r)QO<YuxX5!l{VVXbji@R)JHg$&m*h)dA-jkHC*M4>K4y;|{bf^1HA
z4sqsi?=DjRAC~;9n?n6NbOtJll)ESdd|kn`Hh)+b4_F_CoCh_YP}7zxFsMT`i3Tf*
z(n<_LA7uCMQHw?;du3-7Fc^+1K%gmtZp#<N`j6GR+?<_>uRY!LFC4T+3nJnwnwv5{
zn`nCJq`1L6J-aPD->W(#G8b5WgeE4W!ZxPs)w$$N$U?heg_G;VI#yOzxyMRK0qXQf
zv4_N6JF+^8cxZER{*QI#-z94m3)rt3%gpHgg0GnQ(r!mHn;v|whCoFizOwkv6*u1U
zH-K*u%gm|`E(isL&E`5kY2M6MU>@O?U#z<(=lk%vGZIx&o*~CtL3X~J!|^yOH`=w~
zYOMZFtMU5COM07m@tl_6pi<=m(u$k4-9<O85@BOUanJ6}kE#c;Z2H{svd-*;Vr{<v
z;m4lFBoobiSt{a|eq$~NYV|j<>UXy46edQw2UdyV7@Ko*cCTyeSMY1gAYR97$=vbN
zV3!m>;^gxzQIK!+K4v(udNHDQsq4E59}kpN7=Bs&W&TNCF{6h-@<vLsc7y|S)Wp|5
zBCCm9VQ&?g$uuOC6N3LKRKSkN7eBL;-gfp(*7Fl#6y-E&92a)oa?BhirA&3s+~7Mr
zgkN-Lv#fODGeIi^Jk$}~-C}UVUkA9S($;A;0Z%QR+tTJyJ>SC6l(3z}muogVqE7%I
z9bBZ<cL;yF%^QWeY`9zq$mr(k$_yEVQb&sn*H%AhB**=b>blqJb*oS)vP6$H^^eQ!
z{uu0Qb(L|z!O7I+g5KhDVa>sPMOC(kjxi=d#Tl!`jeV;plGzGU#zh4s$(8&C6<T6?
z5<QHR{Ej2c<~P3k%l%WK<jOT>Ndt4SzH`2@l@M(m<r>rZwckVz`4(P5x6-GXD|~?G
z`M6ByxehzeSRhc#su?_{+1QdVAv%=-!XmX>sCw`Gp8;XKkP(;s=}tj_KRJPY9CY`S
zZCI%jXKRFPxSRK=`EUpJzvZ^@1dIXNdh~_+$Y_JAgcx_R{qNW0jM}7RSX0t%M!IY<
zy~L#&<cV?2(S4K@ox>DZajBc%?quOazZ>QtOV%o6+<zssSR<@<c_+N*d%74BT9P>U
z>~6BZzZ2-F7ZFi#e~c6>#Fv)o68@LblcKTHb?<KofzHBRurc;@{=;8&bM?rS#a<BP
zx}@4C6K;-=kI+DaNNlGXu?9;6;PVa`4H8NdP&CQpL&FU{_<xvVp#S|48`cpeihX@g
z0~)Fu+kdODIsdSUXE8`R1KNb!LiG<kQBVdRA;o$Xz2>lA2cIuH-bX@S08)AYZ^L}v
z2#gtsYcrD<<|JqrcYz8s(tQ^o6pQrN0r@R~#O7uzCb>=Z`j7NGPM*VhhKy_m#`A1~
zp0o}z<$vqMbqwdKz4E^lt?ZE=LPA19=aqi-T!o9a>JExeFi@}MZ0&8yps7EBm;aqi
z7xK36Q~doa!Puy_@p0$sK~r@UXkz|+V4hB>f?B5<e-*Q4{QVj9KLMQoGnsrC`Ccf9
zwQ2r8(S5B*7HjvN*dp!FI81?gn~1r(M?wRa0*yjUEPdpg>cndzX7o%$9>Hogg@sRH
z;kbSd-cjZcz`)#4w0s&R7E6Y!^LYf(Gruy$Z(Xb<V5Fmdf0%%T98Rw|N>fx~6~pup
zJyXfTnM^kk{C2$d@CL(g6M5NjjtU9cjmE&WeqhH@B2%49EB>8@Mf5k-acr`5r*I-X
zjD<!tw9xMr1s7{-pJdygJY>xr2ACp&(AhcKtd^rKY$CBVKCB2Ka3Hj;Vp;qmJEmV1
z?`H=1eZmLgAeqrdB|J$?WQ9j*iWZh+?@d%)@P0nZ*U47QSG8m{>VrCjQ$7NIi@OgB
z-8=K+UCZK>@;TF3W)=MS45cR}_jzJeCx~M2l=&lJ{eF>FRMGNcXljZ%vX@YoI%b9H
zavQT^LCV=)_P#wU%HSKP;&a|cojxg;Ot}hMWdHlO`HB#W2hs`JT|bL|F{ESN!SZio
zc|fzDO#JoLlNhWn<*zCXG5y(&**X^W?Ilka4cyd}ORTor4jpyG{QVQ}r?KVc*X!iR
z^he{zzkqO(H>*CXmYTn+Jr)g7q-#e(p&4QgShpiM94~`~J}%eLdBkV7YWlTs6W6tF
zPVQ0u-NDP}ZPREQ)~*sp9+m^<Kc?$n&_@?$afOU!JC>^$>d`quz_vQUr`adV7hmSe
zKF7;e?CU!^z7=M2o4|F=pjw61MIs@e(u$=fM1k6MiCp6<oRDD?90`f05{_V6#Mh<I
z#d3^|aXx6wak=UCB*WLfF~aQYFJIZu;Ugi-_!l}I{&a<^@JDOC&MGnB(EyUd^MzQS
zd0&x>M4`G9Rg{p@s=MCZ<UoINeD{;3IrumynJ&49_kj5^bErUY8<^zp8MGgEkD$k#
z<^0!<r0bt4*xWk!<qx?Dh2xQa4y1a_Ul9?niQDwi{U$*}*lQ7|50cj;7R}|udH=RN
zi=P&-HeLH&vqd+u5#QuKP`!@~&U1_p`5K42Hi;Q2{h{-`VW(ej4L3fFVh^PSq5$(Y
z94+Z)V$Jnmn-mONtgRYs^Aew%x=E8KASwYk!kyt?5(fzxNRvqhq}axj^TQu3ieJdk
o5MTKJv&aAc&;Q?(Zs``u`jk7HibQ({h)^VHarsvzFAU!OFJMpH-v9sr

literal 0
HcmV?d00001

diff --git a/docs/fori_loop.md b/docs/fori_loop.md
new file mode 100644
index 00000000000..0c9f85af399
--- /dev/null
+++ b/docs/fori_loop.md
@@ -0,0 +1,114 @@
+# Fori_loop
+`fori_loop` is a replacement of pure python for loop, PyTorch/XLA would enable `torch_xla.experimental.fori_loop` to keep loop computation graph as rolled during compilation
+like [`jax.lax.fori_loop`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.fori_loop.html), not like currently repeat computations by enumerating all execution steps
+of each iteration. `fori_loop` might help memory utilization and might help faster compilation.
+
+User could use `fori_loop` like this:
+```python
+from torch_xla.experimental.fori_loop import fori_loop
+res = fori_loop(upper, lower, /*user defined*/body_fun, init)
+```
+
+current fori_loop only support simple test like [link](https://github.com/pytorch/xla/blob/ManfeiBai-patch-81/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py), and user could try [simple user guide](https://github.com/pytorch/xla/blob/ManfeiBai-patch-81/docs/fori_loop.md#simple-example-with-fori_loop) with `fori_loop` on TPU too.
+
+For detailed implementation:
+- for situation that loop range is dynamic, [`fori_loop`](https://github.com/pytorch/xla/blob/ManfeiBai-patch-81/docs/fori_loop.md#fori_loop) is implemented with [`while_loop`](https://github.com/pytorch/xla/blob/ManfeiBai-patch-81/docs/fori_loop.md#while_loop),
+like [`jax.lax.while_loop`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.while_loop.html), PyTorch/XLA would support `while_loop` with the
+native PyTorch and the XLA backend: XLA::While. Due to `while_loop` didn't support autograd, so it would be used for inference only.
+
+- for situation that loop range is not dynamic, [`fori_loop`](https://github.com/pytorch/xla/blob/ManfeiBai-patch-81/docs/fori_loop.md#fori_loop) is implemented with [`scan`](https://github.com/pytorch/xla/blob/ManfeiBai-patch-81/docs/fori_loop.md#wipscan),
+like [`jax.lax.scan`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.scan.html), PyTorch/XLA would enable `scan` using XLA::While operator.
+This implementation would be very similar like `while_loop`. `scan` support autograd, and it could be used in both training and inference.
+
+# while_loop
+`while_loop` is a replacement of pure python while loop, PyTorch has supported `while_loop` in
+[code](https://github.com/pytorch/pytorch/blob/ca6a0e1348ba7dcade1833d983b1b4ca12a5c1e1/torch/_higher_order_ops/while_loop.py#L69). 
+PyTorch/XLA want to support `while_loop` with the native PyTorch and the XLA backend: XLA::While.
+
+User could use `while_loop` like this:
+```python
+import torch_xla.experimental.fori_loop
+from torch._higher_order_ops.while_loop import while_loop
+res = while_loop(/*user-defined*/cond_fn, /*user-defined*/body_fn, /*tuple or list*/init)
+```
+current while_loop only support simple test like [link](https://github.com/pytorch/xla/blob/ManfeiBai-patch-81/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py), and user could try [simple user guide](https://github.com/pytorch/xla/blob/ManfeiBai-patch-81/docs/fori_loop.md#simple-example-with-while_loop) with `while_loop` on TPU too.
+
+
+# [WIP]scan
+like [`jax.lax.scan`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.scan.html), PyTorch/XLA would enable `scan` for training and inference since it support autograd.
+`scan` is WIP.
+
+
+# Simple user guide
+User could try these three simple test case to better compare difference between `pure python for loop` and `fori_loop` and `while_loop`, these three test case have similar logic: cumulative plus 1 for ten times:
+
+### simple example with pure python for loop
+```bash
+# python
+>>> import torch
+>>> init = torch.tensor([0], dtype=torch.int32)
+>>> one_value = torch.ones(1, dtype=torch.int32)
+>>> 
+>>> for i in range(10):
+...   init = init + one_value
+... 
+>>> init
+tensor([10], dtype=torch.int32)
+```
+
+### simple example with `while_loop`:
+```bash
+# PJRT_DEVICE=TPU python
+>>> import torch
+>>> import torch_xla
+>>> import torch_xla.experimental.fori_loop
+>>> from torch_xla.experimental.fori_loop import fori_loop
+>>> from torch._higher_order_ops.while_loop import while_loop
+>>> import torch_xla.core.xla_model as xm
+>>> import torch_xla.core.xla_builder as xb
+>>> 
+>>> device = xm.xla_device()
+>>> 
+>>> def cond_fn(init, limit_value):
+...   return limit_value[0] >= init[0]
+... 
+>>> def body_fn(init, limit_value):
+...   one_value = torch.ones(1, dtype=torch.int32, device=device)
+...   return (torch.add(init, one_value), limit_value.clone())
+... 
+>>> init = torch.tensor([0], dtype=torch.int32, device=device)
+>>> limit_value = torch.tensor([10], dtype=torch.int32, device=device)
+>>> res_, limit_value_ = while_loop(cond_fn, body_fn, (init, limit_value))
+>>> res_
+FunctionalTensor(lvl=0, value=\
+tensor([11], device='xla:0', dtype=torch.int32))
+```
+
+### simple example with `fori_loop`:
+```bash
+# PJRT_DEVICE=TPU python
+>>> import torch
+>>> import torch_xla
+>>> import torch_xla.experimental.fori_loop
+>>> from torch_xla.experimental.fori_loop import fori_loop
+>>> from torch._higher_order_ops.while_loop import while_loop
+>>> import torch_xla.core.xla_model as xm
+>>> import torch_xla.core.xla_builder as xb
+>>> 
+>>> device = xm.xla_device()
+>>> 
+>>> lower = torch.tensor([2], dtype=torch.int32, device=device)
+>>> upper = torch.tensor([52], dtype=torch.int32, device=device)
+>>> plus_value = torch.tensor([1], dtype=torch.int32, device=device)
+>>> init_val = torch.tensor([1], dtype=torch.int32, device=device)
+>>> 
+>>> def body_fun(*argus):
+...   plus_value, init_val = argus
+...   return plus_value, torch.add(plus_value, init_val)
+... 
+>>> _, _, _, res_ = fori_loop(upper, lower, body_fun, plus_value, init_val)
+>>> res_
+tensor([51], device='xla:0', dtype=torch.int32)
+```
+
+For more example and detailed user guide, please read [this test file](https://github.com/pytorch/xla/blob/master/test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py). PyTorch/XLA would include `while_loop` support in 2.3 for simple test case, complex test case and support for `fori_loop` and `scan` would be added after 2.3
diff --git a/docs/fsdpv2.md b/docs/fsdpv2.md
index 28d15ec6936..fe9b782a082 100644
--- a/docs/fsdpv2.md
+++ b/docs/fsdpv2.md
@@ -1,6 +1,6 @@
 # Fully Sharded Data Parallel via SPMD
 
-Fully Sharded Data Parallel via SPMD or FSDPv2 is an utility that re-epxresses the famous FSDP algorithm in SPMD. [This](https://github.com/pytorch/xla/blob/master/torch_xla/experimental/spmd_fully_sharded_data_parallel.py) is
+Fully Sharded Data Parallel via SPMD or FSDPv2 is an utility that re-expresses the famous FSDP algorithm in SPMD. [This](https://github.com/pytorch/xla/blob/master/torch_xla/experimental/spmd_fully_sharded_data_parallel.py) is
 an experimental feature that aiming to offer a familiar interface for users to enjoy all the benefits that SPMD brings into
 the table. The design doc is [here](https://github.com/pytorch/xla/issues/6379).
 
@@ -18,7 +18,7 @@ num_devices = xr.global_runtime_device_count()
 mesh_shape = (num_devices, 1)
 device_ids = np.array(range(num_devices))
 # To be noted, the mesh must have an axis named 'fsdp', which the weights and activations will be sharded on.
-mesh = Mesh(device_ids, mesh_shape, ('fsdp', 'model'))
+mesh = xs.Mesh(device_ids, mesh_shape, ('fsdp', 'model'))
 
 # Shard the input, and assume x is a 2D tensor.
 x = xs.mark_sharding(x, mesh, ('fsdp', None))
@@ -31,8 +31,20 @@ loss = output.sum()
 loss.backward()
 optim.step()
 ```
-It is also possible to shard individual layers separately and have an outer wrapper handle any leftover parameters. The autowrapping
-feature will come in the future releases.
+It is also possible to shard individual layers separately and have an outer wrapper handle any leftover parameters. Here is an example to autowrap each `DecoderLayer`.
+```python3
+from torch_xla.distributed.fsdp.wrap import transformer_auto_wrap_policy
+
+# Apply FSDP sharding on each DecoderLayer layer.
+auto_wrap_policy = functools.partial(
+    transformer_auto_wrap_policy,
+    transformer_layer_cls={
+        decoder_only_model.DecoderLayer
+    },
+)
+model = FSDPv2(
+    model, mesh=mesh, auto_wrap_policy=auto_wrap_policy)
+```
 
 ## Sharding output
 
diff --git a/docs/gpu.md b/docs/gpu.md
index c2678164f4e..de1cf807361 100644
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -71,9 +71,12 @@ source ~/.bashrc
 
 ### Wheel
 ```
-pip3 install torch==2.2.0
-pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.2.0-cp38-cp38-manylinux_2_28_x86_64.whl
+pip3 install torch==2.3.0
+# GPU whl for python 3.10 + cuda 12.1
+pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.3.0-cp310-cp310-manylinux_2_28_x86_64.whl
 ```
+Wheels for other Python version and CUDA version can be found [here](https://github.com/pytorch/xla?tab=readme-ov-file#available-docker-images-and-wheels).
+
 
 ## Run a simple model
 In order to run below examples, you need to clone the pytorch/xla repo to access the imagenet example(We already clone it in our docker).
diff --git a/docs/pallas.md b/docs/pallas.md
new file mode 100644
index 00000000000..46c80b79f2e
--- /dev/null
+++ b/docs/pallas.md
@@ -0,0 +1,57 @@
+# Custom Kernels via Pallas
+
+With the rise of OpenAI [triton](https://openai.com/research/triton), custom kernels become more and more popular in the GPU community, for instance, the introduction of [FlashAttention](https://github.com/Dao-AILab/flash-attention) and [PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html). In order to provide the feature parity in the TPU world, Google has introduced [Pallas](http://go/jax-pallas) and [Mosaic](http://go/mosaic-tpu). For PyTorch/XLA to continue pushing the performance in TPU, we have to support custom kernels, and the best way is through Pallas and Mosaic. The design doc is [TBA]().
+
+Let's assume you have a Pallas kernel defined as follow:
+```python3
+import jax
+from jax.experimental import pallas as pl
+import jax.numpy as jnp
+
+def add_vectors_kernel(x_ref, y_ref, o_ref):
+  x, y = x_ref[...], y_ref[...]
+  o_ref[...] = x + y
+
+@jax.jit
+def add_vectors(x: jax.Array, y: jax.Array) -> jax.Array:
+  return pl.pallas_call(add_vectors_kernel,
+                        out_shape=jax.ShapeDtypeStruct(x.shape, x.dtype)
+                        )(x, y)
+```
+
+## Adopt the above kernel to be compatible with PyTorch/XLA
+
+Example usage:
+```python3
+q = torch.randn(3, 2, 128, 4).to("xla")
+k = torch.randn(3, 2, 128, 4).to("xla")
+v = torch.randn(3, 2, 128, 4).to("xla")
+
+# Adopts any Pallas kernel
+from torch_xla.experimental.custom_kernel import make_kernel_from_pallas
+pt_kernel = make_kernel_from_pallas(add_vectors, lambda x, y: [(x.shape, x.dtype)])
+output = pt_kernel(q, k)
+```
+For simple kernels, the adoption is just as simple as one liner. For more complicated kernels, you can refer to our Flash Attention implementation for details.
+
+## Use built-in kernels
+
+Besides manually wrapping external Pallas kernels, there are built-in kernels where the adoptions are done by PyTorch/XLA already.
+
+Example usage:
+```python3
+# Use built-in kernels
+from torch_xla.experimental.custom_kernel import flash_attention
+output = flash_attention(q, k, v)
+```
+
+You can just use it like any other torch.ops.
+
+## HuggingFace Llama 3 Example
+We have a fork of HF Llama 3 to demonstrate a potential integration [here](https://github.com/pytorch-tpu/transformers/tree/alanwaketan/flash_attention).
+
+## Dependencies
+The Pallas integration depends on JAX to function. However, not every JAX version is compatible with your installed PyTorch/XLA. To install the proper JAX:
+```bash
+pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+```
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 26f491f6c15..0d0f871b154 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,6 +1,6 @@
 mistune==0.8.4
-sphinx==2.4.4
+sphinx==5.3.0
 docutils==0.16
-Jinja2<3.1
+Jinja2==3.1.3
 m2r
 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
diff --git a/docs/spmd.md b/docs/spmd.md
index 5d6e554092d..00e384e496c 100644
--- a/docs/spmd.md
+++ b/docs/spmd.md
@@ -297,7 +297,7 @@ checkpointing directly to any fsspec-compatible filesystem, including GCS.
 Example usage of the CheckpointManager is below:
 
 ```python
-from torch_xla.experimental.distributed_checkpoint import CheckpointManager
+from torch_xla.experimental.distributed_checkpoint import CheckpointManager, prime_optimizer
 
 # Create a CheckpointManager to checkpoint every 10 steps into GCS.
 chkpt_mgr = CheckpointManager('gs://my-bucket/my-experiment', 10)
@@ -307,9 +307,13 @@ tracked_steps = chkpt_mgr.all_steps()
 if tracked_steps:
     # Choose the highest step
     best_step = max(tracked_steps)
-    state_dict = {'model': model.state_dict()}
+    # Before restoring the checkpoint, the optimizer state must be primed
+    # to allow state to be loaded into it.
+    prime_optimizer(optim)
+    state_dict = {'model': model.state_dict(), 'optim': optim.state_dict()}
     chkpt_mgr.restore(best_step, state_dict)
     model.load_state_dict(state_dict['model'])
+    optim.load_state_dict(state_dict['optim'])
 
 # Call `save` or `save_async` every step within the train loop. These methods
 # return True when a checkpoint is taken.
@@ -320,6 +324,18 @@ for step, data in enumerate(dataloader):
         print(f'Checkpoint taken at step {step}')
 ```
 
+##### Restoring Optimizer State
+
+In distributed checkpointing, the state_dicts are loaded in-place, and only the
+required shards of the checkpoint are loaded. Since optimizer states are lazily
+created, the state isn't present until the first `optimizer.step` call, and
+attempts to load an unprimed optimizer will fail.
+
+The utility method `prime_optimizer` is provided for this: it runs a fake train
+step by setting all gradients to zero and calling `optimizer.step`. *This is a
+destructive method and will touch both model parameters and optimizer state*,
+so it should only be called just prior to restoration.
+
 ### Process Groups
 To use `torch.distributed` APIs such as distributed checkpointing, a process
 group is required. In SPMD mode, the `xla` backend is not supported since the
@@ -410,11 +426,11 @@ The SPMD API is general enough to express both data parallelism and model parall
 num_devices = xr.global_runtime_device_count()
 
 # Assume data is 4d and 0th dimension is the batch dimension
-mesh_shape = (num_devices, 1, 1, 1)
-input_mesh = xs.Mesh(device_ids, mesh_shape, ('B', 'C', 'W', 'H'))
-partition_spec = range(num_devices)
+mesh_shape = (num_devices,)
+input_mesh = xs.Mesh(device_ids, mesh_shape, ('Data'))
+partition_spec = ('data', None, None, None)
 
-# Shard the batch dimension
+# Shard the input's batch dimension along the `data` axis, no sharding along other dimensions
 xs.mark_sharding(input_tensor, input_mesh, partition_spec)
 ```
 
@@ -424,9 +440,9 @@ PyTorch/XLA’s MpDeviceLoader supports input batch sharding, which also loads t
 num_devices = xr.global_runtime_device_count()
 
 # Assume data is 4d and 0th dimension is the batch dimension
-mesh_shape = (num_devices, 1, 1, 1)
-input_mesh = xs.Mesh(device_ids, mesh_shape, ('B', 'C', 'W', 'H'))
-partition_spec = range(num_devices)
+mesh_shape = (num_devices)
+input_mesh = xs.Mesh(device_ids, mesh_shape, ('Data'))
+partition_spec = ('data', None, None, None)
 
 # Use MpDeviceLoader to load data in background
 train_loader = pl.MpDeviceLoader(
@@ -444,10 +460,13 @@ PyTorch’s FSDP is data parallel + sharded model parameters at 0th dimension. U
 
 ```python
 for name, param in model.named_parameters():
-    shape = (num_devices,) + (1,) * (len(param.shape) - 1)
-    mesh = xs.Mesh(device_ids, shape)
-    xs.mark_sharding(param, mesh, range(len(param.shape)))
+    shape = (num_devices,)
+    mesh = xs.Mesh(device_ids, shape, ('fsdp'))
+    partition_spec = [None] * len(param.shape)
+    partition_spec[0] = 'fsdp'
+    xs.mark_sharding(param, mesh, partition_spec)
 ```
+PyTorch/XLA also provided a convenient wrapper for the FSDP with SPMD, please take a look at this [user guide](https://github.com/pytorch/xla/blob/master/docs/fsdpv2.md).
 
 
 ### Running Resnet50 example with SPMD
@@ -470,6 +489,7 @@ Note that I used a batch size 4 times as large since I am running it on a TPU v4
 
 We provide a `shard placement visualization debug tool` for PyTorch/XLA SPMD user on TPU/GPU/CPU with single-host/multi-host: you could use `visualize_tensor_sharding` to visualize sharded tensor, or you could use `visualize_sharding` to visualize sharing string. Here are two code examples on TPU single-host(v4-8) with `visualize_tensor_sharding` or `visualize_sharding`:
 - Code snippet used `visualize_tensor_sharding` and visualization result:
+
 ```python
 import rich
 
@@ -482,7 +502,9 @@ from torch_xla.distributed.spmd.debugging import visualize_tensor_sharding
 generated_table = visualize_tensor_sharding(t, use_color=False)
 ```
 ![alt_text](assets/spmd_debug_1.png "visualize_tensor_sharding example on TPU v4-8(single-host)")
+
 - Code snippet used `visualize_sharding` and visualization result:
+
 ```python
 from torch_xla.distributed.spmd.debugging import visualize_sharding
 sharding = '{devices=[2,2]0,1,2,3}'
@@ -498,11 +520,13 @@ We are introducing a new PyTorch/XLA SPMD feature, called ``auto-sharding``, [RF
 PyTorch/XLA auto-sharding can be enabled by one of the following:
 - Setting envvar `XLA_SPMD_AUTO=1`
 - Calling the SPMD API in the beginning of your code:
+
 ```python
 import torch_xla.runtime as xr
 xr.use_spmd(auto=True)
 ```
 - Calling `pytorch.distributed._tensor.distribute_module` with `auto-policy` and `xla`:
+
 ```python
 import torch_xla.runtime as xr
 from torch.distributed._tensor import DeviceMesh, distribute_module
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 00000000000..1ad0018c981
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,17 @@
+## Overview
+This repo aims to provide some basic examples of how to run an existing pytorch model with PyTorch/XLA. `train_resnet_base.py` is a minimal trainer to run ResNet50 with fake data on a single device. `train_decoder_only_base.py` is similar to `train_resnet_base.py` but with a decoder only model.
+
+Other examples will import the `train_resnet_base` or `train_decoder_only_base` and demonstrate how to enable different features(distributed training, profiling, dynamo etc) on PyTorch/XLA.The objective of this repository is to offer fundamental examples of executing an existing PyTorch model utilizing PyTorch/XLA.
+
+## Setup
+Follow our [README](https://github.com/pytorch/xla#getting-started) to install latest release of torch_xla. Check out this [link](https://github.com/pytorch/xla#python-packages) for torch_xla at other versions. To install the nightly torchvision(required for the resnet) you can do
+
+```shell
+pip install --no-deps --pre torchvision -i https://download.pytorch.org/whl/nightly/cu118
+```
+
+## Run the example
+You can run all models directly. Only environment you want to set is `PJRT_DEVICE`.
+```
+PJRT_DEVICE=TPU python fsdp/train_decoder_only_fsdp_v2.py
+```
diff --git a/examples/data_parallel/README.md b/examples/data_parallel/README.md
new file mode 100644
index 00000000000..2be94bce14e
--- /dev/null
+++ b/examples/data_parallel/README.md
@@ -0,0 +1,2 @@
+## Recommendation
+Please consider using `train_resnet_spmd_data_parallel.py` since it uses SPMD internally and are very likely yield better perfomrance.
diff --git a/examples/data_parallel/train_resnet_ddp.py b/examples/data_parallel/train_resnet_ddp.py
new file mode 100644
index 00000000000..898983a714b
--- /dev/null
+++ b/examples/data_parallel/train_resnet_ddp.py
@@ -0,0 +1,30 @@
+import sys
+import os
+example_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
+sys.path.append(example_folder)
+from train_resnet_base import TrainResNetBase
+
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+import torch.optim as optim
+import torch_xla.distributed.xla_multiprocessing as xmp
+
+
+class TrainResNetDDP(TrainResNetBase):
+
+  def __init__(self):
+    super().__init__()
+    dist.init_process_group('xla', init_method='xla://')
+    self.model = DDP(
+        self.model, gradient_as_bucket_view=True, broadcast_buffers=False)
+    self.optimizer = optim.SGD(self.model.parameters(), weight_decay=1e-4)
+
+
+def _mp_fn(index):
+  ddp = TrainResNetDDP()
+  ddp.start_training()
+
+
+if __name__ == '__main__':
+  print('consider using train_resnet_spmd_data_parallel.py instead to get better performance')
+  xmp.spawn(_mp_fn, args=())
diff --git a/examples/data_parallel/train_resnet_spmd_data_parallel.py b/examples/data_parallel/train_resnet_spmd_data_parallel.py
new file mode 100644
index 00000000000..3a5eaca39a5
--- /dev/null
+++ b/examples/data_parallel/train_resnet_spmd_data_parallel.py
@@ -0,0 +1,49 @@
+import sys
+import os
+example_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
+sys.path.append(example_folder)
+from train_resnet_base import TrainResNetBase
+
+import numpy as np
+
+import torch
+import torch_xla.distributed.xla_multiprocessing as xmp
+import torch_xla.core.xla_model as xm
+import torch_xla.distributed.spmd as xs
+import torch_xla.distributed.parallel_loader as pl
+import torch_xla.utils.utils as xu
+from torch_xla import runtime as xr
+
+# Enable the SPMD
+xr.use_spmd()
+
+
+# More detailed examaple can be found in https://github.com/pytorch/xla/blob/master/test/spmd/test_train_spmd_imagenet.py
+# Check out our user guide in https://github.com/pytorch/xla/blob/master/docs/spmd.md
+class TrainResNetXLASpmdDDP(TrainResNetBase):
+
+  def __init__(self):
+    super().__init__()
+    # Shard along batch dimension only
+    num_devices = xr.global_runtime_device_count()
+    device_ids = np.arange(num_devices)
+    mesh_shape = (num_devices,)
+    mesh = xs.Mesh(device_ids, mesh_shape, ('data',))
+    # scale the batch size with num_devices since there will be only one
+    # process that handles all runtime devices.
+    self.batch_size *= num_devices
+
+    train_loader = xu.SampleGenerator(
+        data=(torch.zeros(self.batch_size, 3, self.img_dim, self.img_dim),
+              torch.zeros(self.batch_size, dtype=torch.int64)),
+        sample_count=self.train_dataset_len // self.batch_size)
+    self.train_device_loader = pl.MpDeviceLoader(
+        train_loader,
+        self.device,
+        # Shard the input's batch dimension along the `data` axis, no sharding along other dimensions
+        input_sharding=xs.ShardingSpec(mesh, ('data', None, None, None)))
+
+
+if __name__ == '__main__':
+  spmd_ddp = TrainResNetXLASpmdDDP()
+  spmd_ddp.start_training()
diff --git a/examples/data_parallel/train_resnet_xla_ddp.py b/examples/data_parallel/train_resnet_xla_ddp.py
new file mode 100644
index 00000000000..4ac99904422
--- /dev/null
+++ b/examples/data_parallel/train_resnet_xla_ddp.py
@@ -0,0 +1,25 @@
+import sys
+import os
+example_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
+sys.path.append(example_folder)
+from train_resnet_base import TrainResNetBase
+
+import torch_xla.distributed.xla_multiprocessing as xmp
+import torch_xla.core.xla_model as xm
+
+
+class TrainResNetXLADDP(TrainResNetBase):
+
+  def run_optimizer(self):
+    # optimizer_step will call `optimizer.step()` and all_reduce the gradident
+    xm.optimizer_step(self.optimizer)
+
+
+def _mp_fn(index):
+  xla_ddp = TrainResNetXLADDP()
+  xla_ddp.start_training()
+
+
+if __name__ == '__main__':
+  print('consider using train_resnet_spmd_data_parallel.py instead to get better performance')
+  xmp.spawn(_mp_fn, args=())
diff --git a/examples/debug/train_resnet_benchmark.py b/examples/debug/train_resnet_benchmark.py
new file mode 100644
index 00000000000..50afe7f807f
--- /dev/null
+++ b/examples/debug/train_resnet_benchmark.py
@@ -0,0 +1,49 @@
+import sys
+import os
+example_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
+sys.path.append(example_folder)
+from train_resnet_base import TrainResNetBase
+
+import itertools
+import time
+
+import torch_xla
+import torch_xla.core.xla_model as xm
+
+
+# This example aims to provide a simple way to benchmark torch_xla. Ideally device execution
+# time should be greater than the tracing time so tracing time can be overlapped perfectlly.
+# If that's not the case try to increase the batch size which will increase the device execution
+# time and keep tracing time the same.
+class TrainResNetBenchmark(TrainResNetBase):
+
+  def train_loop_fn(self, loader, epoch):
+    self.model.train()
+    loader = itertools.islice(loader, self.num_steps)
+    for step, (data, target) in enumerate(loader):
+      tracing_start_time = time.time()
+      self.optimizer.zero_grad()
+      output = self.model(data)
+      loss = self.loss_fn(output, target)
+      loss.backward()
+      self.run_optimizer()
+      tracing_end_time = time.time()
+      # for releases < 2.3 uses `xm.mark_step()`.
+      # Couple things to note
+      # 1. sync itself is not blocking, it will schedule a device execution and return.
+      # 2. In TrainResNetBase we uses MpDeviceLoader which calls `mark_step` for every batch.
+      #    We don't have to manually call `sync` here if we don't want to wait for execution to finish.
+      torch_xla.sync()
+      # Do not call this function every step unless you are benchmarking. It will block the main process
+      # and torch_xla won't be able to overlap the tracing of the next step with the device
+      # execution of the current step. For e2e benchmarking, call `wait_device_ops` once at the end.
+      xm.wait_device_ops()
+      device_execution_end_time = time.time()
+      print(
+          f'Step: {step}, Tracing time: {tracing_end_time - tracing_start_time}s, E2E time: {device_execution_end_time - tracing_start_time}s'
+      )
+
+
+if __name__ == '__main__':
+  benchmark = TrainResNetBenchmark()
+  benchmark.start_training()
diff --git a/examples/debug/train_resnet_profile.py b/examples/debug/train_resnet_profile.py
new file mode 100644
index 00000000000..158c138dba7
--- /dev/null
+++ b/examples/debug/train_resnet_profile.py
@@ -0,0 +1,30 @@
+import os
+import sys
+example_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
+sys.path.append(example_folder)
+from train_resnet_base import TrainResNetBase
+
+import torch_xla.debug.profiler as xp
+
+# check https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md#environment-variables
+os.environ["XLA_IR_DEBUG"] = "1"
+os.environ["XLA_HLO_DEBUG"] = "1"
+
+if __name__ == '__main__':
+  base = TrainResNetBase()
+  profile_port = 9012
+  # you can also set profile_logdir to a gs bucket, for example
+  # profile_logdir = "gs://your_gs_bucket/profile"
+  profile_logdir = "/tmp/profile/"
+  duration_ms = 30000
+  assert profile_logdir.startswith('gs://') or os.path.exists(profile_logdir)
+  server = xp.start_server(profile_port)
+  # Ideally you want to start the profile tracing after the initial compilation, for example
+  # at step 5.
+  xp.trace_detached(
+      f'localhost:{profile_port}', profile_logdir, duration_ms=duration_ms)
+  base.start_training()
+  # You can view the profile at tensorboard by
+  # 1. pip install tensorflow tensorboard-plugin-profile
+  # 2. tensorboard --logdir /tmp/profile/ --port 6006
+  # For more detail plase take a look at https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm
diff --git a/examples/decoder_only_model.py b/examples/decoder_only_model.py
new file mode 100644
index 00000000000..f335428d0af
--- /dev/null
+++ b/examples/decoder_only_model.py
@@ -0,0 +1,227 @@
+from dataclasses import dataclass
+from typing import Optional
+import math
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+@dataclass
+class DecoderOnlyConfig:
+  hidden_size: int = 1024
+  num_hidden_layers: int = 2
+  num_attention_heads: int = 8
+  num_key_value_heads: int = 4
+  intermediate_size = 64 * 1024
+  vocab_size = 3200
+  use_flash_attention = False
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+  """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+  batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+  if n_rep == 1:
+    return hidden_states
+  hidden_states = hidden_states[:, :,
+                                None, :, :].expand(batch, num_key_value_heads,
+                                                   n_rep, slen, head_dim)
+  return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen,
+                               head_dim)
+
+
+class RMSNorm(nn.Module):
+
+  def __init__(self, hidden_size, eps=1e-6):
+    """
+        RMSNorm is equivalent to LlamaRMSNorm
+        """
+    super().__init__()
+    self.weight = nn.Parameter(torch.ones(hidden_size))
+    self.variance_epsilon = eps
+
+  def forward(self, hidden_states):
+    input_dtype = hidden_states.dtype
+    hidden_states = hidden_states.to(torch.float32)
+    variance = hidden_states.pow(2).mean(-1, keepdim=True)
+    hidden_states = hidden_states * torch.rsqrt(variance +
+                                                self.variance_epsilon)
+    return self.weight * hidden_states.to(input_dtype)
+
+
+# 1. no kv_chche
+# 2. no rotary embedding
+# 3. no attention_mask
+class GroupQueryAttention(nn.Module):
+  """Stripped-down version of the LlamaAttention"""
+
+  def __init__(self, config: DecoderOnlyConfig):
+    super().__init__()
+    self.config = config
+
+    self.hidden_size = config.hidden_size
+    self.num_heads = config.num_attention_heads
+    self.head_dim = self.hidden_size // self.num_heads
+    self.num_key_value_heads = config.num_key_value_heads
+    self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+
+    self.q_proj = nn.Linear(
+        self.hidden_size, self.num_heads * self.head_dim, bias=False)
+    self.k_proj = nn.Linear(
+        self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+    self.v_proj = nn.Linear(
+        self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+    self.o_proj = nn.Linear(
+        self.num_heads * self.head_dim, self.hidden_size, bias=False)
+    self.flash_attention_impl = None
+
+  def forward(
+      self,
+      hidden_states: torch.Tensor,
+  ) -> torch.Tensor:
+
+    bsz, q_len, _ = hidden_states.size()
+    # [B, S, H] -> [B, S, n_head * head_dim]
+    query_states = self.q_proj(hidden_states)
+    # [B, S, H] -> [B, S, n_kv_head * head_dim]
+    key_states = self.k_proj(hidden_states)
+    # [B, S, H] -> [B, S, n_kv_head * head_dim]
+    value_states = self.v_proj(hidden_states)
+
+    # [B, S, n_head * head_dim] -> [B, n_head, S, head_dim]
+    query_states = query_states.view(bsz, q_len, self.num_heads,
+                                     self.head_dim).transpose(1, 2)
+    # [B, S, n_kv_head * head_dim] -> [B, n_kv_head, S, head_dim]
+    key_states = key_states.view(bsz, q_len, self.num_key_value_heads,
+                                 self.head_dim).transpose(1, 2)
+    # [B, S, n_kv_head * head_dim] -> [B, n_kv_head, S, head_dim]
+    value_states = value_states.view(bsz, q_len, self.num_key_value_heads,
+                                     self.head_dim).transpose(1, 2)
+
+    # [B, n_kv_head, S, head_dim] -> [B, n_head, S, head_dim]
+    key_states = repeat_kv(key_states, self.num_key_value_groups)
+    # [B, n_kv_head, S, head_dim] -> [B, n_head, S, head_dim]
+    value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+    if not self.config.use_flash_attention:
+      # [B, n_head, S, head_dim] @ T([B, n_head, S, head_dim]) -> [B, n_head, S, S]
+      attn_weights = torch.einsum('bnsh,bnkh->bnsk', query_states,
+                                  key_states) / math.sqrt(self.head_dim)
+
+      # upcast attention to fp32
+      attn_weights = nn.functional.softmax(
+          attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+
+      # [B, n_head, S, S] @ T([B, n_head, S, head_dim]) -> [B, n_head, S, head_dim]
+      attn_output = torch.einsum('bnsk,bnkh->bnsh', attn_weights, value_states)
+    else:
+      assert self.flash_attention_impl != None
+      # [B, n_head, S, head_dim], [B, n_head, S, head_dim], [B, n_head, S, head_dim]
+      # -> [B, n_head, S, head_dim]
+      attn_output = self.flash_attention_impl(query_states, key_states,
+                                              value_states)
+
+    # [B, n_head, S, head_dim] -> [B * S * n_head * head_dim]
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    # [B * S * n_head * head_dim] -> [B, S, H]
+    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+    # [B, S, H] -> [B, S, H]
+    attn_output = self.o_proj(attn_output)
+
+    return attn_output
+
+
+class MLP(nn.Module):
+  """Stripped-down version of the LlamaMLP"""
+
+  def __init__(self, config: DecoderOnlyConfig):
+    super().__init__()
+    self.config = config
+
+    self.hidden_size = config.hidden_size
+    self.intermediate_size = config.intermediate_size
+    self.gate_proj = nn.Linear(
+        self.hidden_size, self.intermediate_size, bias=False)
+    self.up_proj = nn.Linear(
+        self.hidden_size, self.intermediate_size, bias=False)
+    self.down_proj = nn.Linear(
+        self.intermediate_size, self.hidden_size, bias=False)
+    self.act_fn = F.silu
+
+  def forward(self, x):
+    # [B, S, H] -> [B, S, I]
+    up_proj = self.up_proj(x)
+    # [B, S, H] -> [B, S, I]
+    gate_proj = self.act_fn(self.gate_proj(x))
+    # ([B, S, I] * [B, S, I]) -> [B, S, H]
+    down_proj = self.down_proj(gate_proj * up_proj)
+    return down_proj
+
+
+class DecoderLayer(nn.Module):
+
+  def __init__(self, config: DecoderOnlyConfig):
+    super().__init__()
+    self.hidden_size = config.hidden_size
+    self.self_attn = (GroupQueryAttention(config=config))
+    self.mlp = MLP(config)
+    self.input_layernorm = RMSNorm(config.hidden_size)
+    self.post_attention_layernorm = RMSNorm(config.hidden_size)
+
+  def forward(
+      self,
+      hidden_states: torch.Tensor,
+      **kwargs,
+  ) -> torch.FloatTensor:
+    residual = hidden_states
+
+    hidden_states = self.input_layernorm(hidden_states)
+
+    # Self Attention
+    hidden_states = self.self_attn(hidden_states=hidden_states,)
+    hidden_states = residual + hidden_states
+
+    # Fully Connected
+    residual = hidden_states
+    hidden_states = self.post_attention_layernorm(hidden_states)
+    hidden_states = self.mlp(hidden_states)
+    hidden_states = residual + hidden_states
+
+    return hidden_states
+
+
+# 1. no gradient_checkpointing
+# 2. no padding_idx
+# 3. no kv cache
+class DecoderOnlyModel(nn.Module):
+
+  def __init__(self, config: DecoderOnlyConfig):
+    super(DecoderOnlyModel, self).__init__()
+    self.vocab_size = config.vocab_size
+    self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+    self.layers = nn.ModuleList(
+        [DecoderLayer(config) for _ in range(config.num_hidden_layers)])
+    self.norm = RMSNorm(config.hidden_size)
+    self.output = nn.Linear(config.hidden_size, self.vocab_size, bias=False)
+
+  def forward(
+      self,
+      input_ids: torch.LongTensor = None,
+  ) -> torch.Tensor:
+    inputs_embeds = self.embed_tokens(input_ids)
+
+    # embed positions
+    hidden_states = inputs_embeds
+
+    # decoder layers
+    for idx, decoder_layer in enumerate(self.layers):
+      layer_outputs = decoder_layer(hidden_states,)
+      hidden_states = layer_outputs
+
+    hidden_states = self.norm(hidden_states)
+    # [B, S, H] -> [B, S, V]
+    return self.output(hidden_states)
diff --git a/examples/flash_attention/train_decoder_only_flash_attention.py b/examples/flash_attention/train_decoder_only_flash_attention.py
new file mode 100644
index 00000000000..442c27240f6
--- /dev/null
+++ b/examples/flash_attention/train_decoder_only_flash_attention.py
@@ -0,0 +1,33 @@
+import sys
+import os
+example_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
+sys.path.append(example_folder)
+from train_decoder_only_base import TrainDecoderOnlyBase
+
+import math
+
+
+def apply_xla_flash_attention(query_states, key_states, value_states):
+  from torch_xla.experimental.custom_kernel import flash_attention
+
+  # q, k, v should all have the shape [B, n_head, S, head_dim]
+  head_dim = query_states.size()[-1]
+  query_states = query_states / math.sqrt(head_dim)
+  # Our simplified version of decoder only model does not use any mask.
+  attn_output = flash_attention(
+      query_states, key_states, value_states, causal=False)
+  return attn_output
+
+
+class TrainDecoderOnlyFlashAttention(TrainDecoderOnlyBase):
+
+  def __init__(self):
+    super().__init__()
+    self.config.use_flash_attention = True
+    for layer in self.model.layers:
+      layer.self_attn.flash_attention_impl = apply_xla_flash_attention
+
+
+if __name__ == '__main__':
+  fa = TrainDecoderOnlyFlashAttention()
+  fa.start_training()
diff --git a/examples/flash_attention/train_decoder_only_flash_attention_fsdp_v2.py b/examples/flash_attention/train_decoder_only_flash_attention_fsdp_v2.py
new file mode 100644
index 00000000000..387ad7aed51
--- /dev/null
+++ b/examples/flash_attention/train_decoder_only_flash_attention_fsdp_v2.py
@@ -0,0 +1,36 @@
+import sys
+import os
+example_fsdp_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0]))) + '/fsdp'
+sys.path.append(example_fsdp_folder)
+from train_decoder_only_fsdp_v2 import TrainDecoderOnlyFSDPv2
+
+import math
+
+from torch_xla import runtime as xr
+
+def apply_xla_flash_attention_with_spmd(query_states, key_states, value_states):
+    from torch_xla.experimental.custom_kernel import flash_attention
+
+    # q, k, v should all have the shape [B, n_head, S, head_dim]
+    head_dim = query_states.size()[-1]
+    query_states = query_states / math.sqrt(head_dim)
+
+    # Our simplified version of decoder only model does not use any mask.
+    # flash_attention will use the global_mesh set in the TrainDecoderOnlyFSDPv2.
+    attn_output = flash_attention(
+        query_states, key_states, value_states, causal=False, partition_spec=('fsdp', None, None, None))
+    return attn_output
+
+class TrainDecoderOnlyFlashAttentionFSDPv2(TrainDecoderOnlyFSDPv2):
+    def __init__(self):
+        super().__init__()
+
+        self.config.use_flash_attention = True
+        for layer in self.model.layers:
+            layer.self_attn.flash_attention_impl = apply_xla_flash_attention_with_spmd        
+
+if __name__ == '__main__':
+  # Enable the SPMD
+  xr.use_spmd()
+  fa_fsdp = TrainDecoderOnlyFlashAttentionFSDPv2()
+  fa_fsdp.start_training()
diff --git a/examples/fsdp/README.md b/examples/fsdp/README.md
new file mode 100644
index 00000000000..761ae64e16d
--- /dev/null
+++ b/examples/fsdp/README.md
@@ -0,0 +1,2 @@
+## Recommendation
+Please consider using `train_decoder_only_fsdp_v2.py` since it uses SPMD internally and are very likely yield better perfomrance.
diff --git a/examples/fsdp/train_decoder_only_fsdp_v2.py b/examples/fsdp/train_decoder_only_fsdp_v2.py
new file mode 100644
index 00000000000..589bb2c3827
--- /dev/null
+++ b/examples/fsdp/train_decoder_only_fsdp_v2.py
@@ -0,0 +1,63 @@
+import sys
+import os
+example_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
+sys.path.append(example_folder)
+import decoder_only_model
+from train_decoder_only_base import TrainDecoderOnlyBase
+
+import functools
+
+import torch
+import numpy as np
+import torch_xla.distributed.spmd as xs
+import torch_xla.utils.utils as xu
+import torch_xla.distributed.parallel_loader as pl
+from torch_xla.experimental.spmd_fully_sharded_data_parallel import SpmdFullyShardedDataParallel as FSDPv2
+from torch_xla import runtime as xr
+from torch_xla.distributed.fsdp.wrap import transformer_auto_wrap_policy
+
+# checkout our doc at https://github.com/pytorch/xla/blob/master/docs/fsdpv2.md
+class TrainDecoderOnlyFSDPv2(TrainDecoderOnlyBase):
+
+  def __init__(self):
+    super().__init__()
+    # Define the mesh following common SPMD practice
+    num_devices = xr.global_runtime_device_count()
+    mesh_shape = (num_devices, 1)
+    device_ids = np.array(range(num_devices))
+    # To be noted, the mesh must have an axis named 'fsdp', which the weights and activations will be sharded on.
+    mesh = xs.Mesh(device_ids, mesh_shape, ('fsdp', 'model'))
+    xs.set_global_mesh(mesh)
+
+    # Shard the input(data parallel).
+    # Scale the batch size with num_devices since there will be only one
+    # process that handles all runtime devices.
+    self.batch_size *= num_devices
+    train_loader = xu.SampleGenerator(
+        data=(torch.zeros(self.batch_size, self.seq_len, dtype=torch.int64),
+              torch.zeros(self.batch_size, self.seq_len, dtype=torch.int64)),
+        sample_count=self.train_dataset_len // self.batch_size)
+    self.train_device_loader = pl.MpDeviceLoader(
+        train_loader,
+        self.device,
+        # Shard the input's batch dimension along the `fsdp` axis, no sharding along other dimensions
+        input_sharding=xs.ShardingSpec(mesh, ('fsdp', None)))
+
+    # Apply FSDP sharding on each DecoderLayer layer.
+    auto_wrap_policy = functools.partial(
+        transformer_auto_wrap_policy,
+        transformer_layer_cls={
+            decoder_only_model.DecoderLayer
+        },
+    )
+    # FSDPv2 will use the global mesh set above
+    self.model = FSDPv2(
+        self.model, auto_wrap_policy=auto_wrap_policy)
+    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.0001)
+
+
+if __name__ == '__main__':
+  # Enable the SPMD
+  xr.use_spmd()
+  base = TrainDecoderOnlyFSDPv2()
+  base.start_training()
diff --git a/examples/fsdp/train_resnet_fsdp_auto_wrap.py b/examples/fsdp/train_resnet_fsdp_auto_wrap.py
new file mode 100644
index 00000000000..a7dc5679d00
--- /dev/null
+++ b/examples/fsdp/train_resnet_fsdp_auto_wrap.py
@@ -0,0 +1,56 @@
+import sys
+import os
+example_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
+sys.path.append(example_folder)
+from train_resnet_base import TrainResNetBase
+from functools import partial
+
+import torch
+import torchvision
+import torch.optim as optim
+import torch_xla.distributed.xla_multiprocessing as xmp
+import torch_xla.core.xla_model as xm
+from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP, checkpoint_module
+from torch_xla.distributed.fsdp.wrap import (size_based_auto_wrap_policy,
+                                             transformer_auto_wrap_policy)
+
+class TrainResNetXLAFSDP(TrainResNetBase):
+
+  def __init__(self):
+    super().__init__()
+    # auto_wrap_policy can be either size_based or type_based
+    auto_wrap_policy = "size_based"
+    auto_wrap_min_num_params = 1e6
+    if auto_wrap_policy == "size_based":
+      # auto-wrap all sub-modules with a certain number of parameters (default 1e6)
+      auto_wrap_policy = partial(
+          size_based_auto_wrap_policy, min_num_params=auto_wrap_min_num_params)
+    elif auto_wrap_policy == "type_based":
+      # auto-wrap all sub-modules in torchvision ResNet's BasicBlock or Bottleneck
+      # or torchvision transformer's EncoderBlock as an example
+      # (transformer_auto_wrap_policy wraps all sub-modules in transformer_layer_cls)
+      auto_wrap_policy = partial(
+          transformer_auto_wrap_policy,
+          transformer_layer_cls={
+              torchvision.models.resnet.BasicBlock,
+              torchvision.models.resnet.Bottleneck,
+              torchvision.models.vision_transformer.EncoderBlock,
+          })
+    else:
+      raise Exception(f"Invalid auto-wrap policy: {auto_wrap_policy}")
+    self.model = FSDP(
+        self.model,
+        compute_dtype=torch.float32,
+        pin_layout_in_collective_ops=True,
+        auto_wrap_policy=auto_wrap_policy)
+    self.optimizer = optim.SGD(self.model.parameters(), weight_decay=1e-4)
+
+
+def _mp_fn(index):
+  xla_fsdp = TrainResNetXLAFSDP()
+  xla_fsdp.start_training()
+
+
+if __name__ == '__main__':
+  print('consider using train_decoder_only_fsdp_v2.py instead to get better performance')
+  xmp.spawn(_mp_fn, args=())
diff --git a/examples/train_decoder_only_base.py b/examples/train_decoder_only_base.py
new file mode 100644
index 00000000000..cd99a4303a5
--- /dev/null
+++ b/examples/train_decoder_only_base.py
@@ -0,0 +1,73 @@
+from decoder_only_model import DecoderOnlyConfig, DecoderOnlyModel
+
+from torch_xla import runtime as xr
+import torch_xla.utils.utils as xu
+import torch_xla.core.xla_model as xm
+import torch_xla.distributed.parallel_loader as pl
+
+import time
+import itertools
+
+import torch
+import torch_xla
+import torch.optim as optim
+import torch.nn as nn
+
+
+class TrainDecoderOnlyBase():
+
+  def __init__(self):
+    self.config = DecoderOnlyConfig()
+    self.batch_size = 16
+    self.seq_len = 512
+    self.num_steps = 300
+    self.num_epochs = 1
+    self.train_dataset_len = 1200000  # Roughly the size of Imagenet dataset.
+    # For the purpose of this example, we are going to use fake data.
+    train_loader = xu.SampleGenerator(
+        data=(torch.zeros(self.batch_size, self.seq_len, dtype=torch.int64),
+              torch.zeros(self.batch_size, self.seq_len, dtype=torch.int64)),
+        sample_count=self.train_dataset_len // self.batch_size)
+
+    self.device = torch_xla.device()
+    self.train_device_loader = pl.MpDeviceLoader(train_loader, self.device)
+    self.model = DecoderOnlyModel(self.config).to(self.device)
+    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.0001)
+    self.loss_fn = nn.CrossEntropyLoss()
+
+  def _train_update(self, step, loss, tracker, epoch):
+    print(f'epoch: {epoch}, step: {step}, loss: {loss}, rate: {tracker.rate()}')
+
+  def run_optimizer(self):
+    self.optimizer.step()
+
+  def train_loop_fn(self, loader, epoch):
+    tracker = xm.RateTracker()
+    self.model.train()
+    loader = itertools.islice(loader, self.num_steps)
+    for step, (data, target) in enumerate(loader):
+      self.optimizer.zero_grad()
+      logits = self.model(data)
+      loss = self.loss_fn(
+          logits.view(-1, self.config.vocab_size), target.view(-1))
+      loss.backward()
+      self.run_optimizer()
+      tracker.add(self.batch_size)
+      if step % 10 == 0:
+        xm.add_step_closure(
+            self._train_update, args=(step, loss, tracker, epoch))
+
+  def start_training(self):
+
+    for epoch in range(1, self.num_epochs + 1):
+      xm.master_print('Epoch {} train begin {}'.format(
+          epoch, time.strftime('%l:%M%p %Z on %b %d, %Y')))
+      self.train_loop_fn(self.train_device_loader, epoch)
+      xm.master_print('Epoch {} train end {}'.format(
+          epoch, time.strftime('%l:%M%p %Z on %b %d, %Y')))
+    xm.wait_device_ops()
+
+
+if __name__ == '__main__':
+  base = TrainDecoderOnlyBase()
+  base.start_training()
diff --git a/examples/train_resnet_amp.py b/examples/train_resnet_amp.py
new file mode 100644
index 00000000000..ae541705d71
--- /dev/null
+++ b/examples/train_resnet_amp.py
@@ -0,0 +1,35 @@
+from train_resnet_base import TrainResNetBase
+
+import itertools
+
+import torch_xla.distributed.xla_multiprocessing as xmp
+import torch_xla.core.xla_model as xm
+from torch_xla.amp import autocast
+
+
+# For more details check https://github.com/pytorch/xla/blob/master/docs/amp.md
+class TrainResNetXLAAMP(TrainResNetBase):
+
+  def train_loop_fn(self, loader, epoch):
+    tracker = xm.RateTracker()
+    self.model.train()
+    loader = itertools.islice(loader, self.num_steps)
+    for step, (data, target) in enumerate(loader):
+      self.optimizer.zero_grad()
+      # Enables autocasting for the forward pass
+      with autocast(xm.xla_device()):
+        output = self.model(data)
+        loss = self.loss_fn(output, target)
+      # TPU amp uses bf16 hence gradient scaling is not necessary. If runnign with XLA:GPU
+      # check https://github.com/pytorch/xla/blob/master/docs/amp.md#amp-for-xlagpu.
+      loss.backward()
+      self.run_optimizer()
+      tracker.add(self.batch_size)
+      if step % 10 == 0:
+        xm.add_step_closure(
+            self._train_update, args=(step, loss, tracker, epoch))
+
+
+if __name__ == '__main__':
+  xla_amp = TrainResNetXLAAMP()
+  xla_amp.start_training()
diff --git a/examples/train_resnet_base.py b/examples/train_resnet_base.py
new file mode 100644
index 00000000000..b66780d5cd9
--- /dev/null
+++ b/examples/train_resnet_base.py
@@ -0,0 +1,71 @@
+from torch_xla import runtime as xr
+import torch_xla.utils.utils as xu
+import torch_xla.core.xla_model as xm
+import torch_xla.distributed.parallel_loader as pl
+
+import time
+import itertools
+
+import torch
+import torch_xla
+import torchvision
+import torch.optim as optim
+import torch.nn as nn
+
+
+class TrainResNetBase():
+
+  def __init__(self):
+    self.img_dim = 224
+    self.batch_size = 128
+    self.num_steps = 300
+    self.num_epochs = 1
+    self.train_dataset_len = 1200000  # Roughly the size of Imagenet dataset.
+    # For the purpose of this example, we are going to use fake data.
+    train_loader = xu.SampleGenerator(
+        data=(torch.zeros(self.batch_size, 3, self.img_dim, self.img_dim),
+              torch.zeros(self.batch_size, dtype=torch.int64)),
+        sample_count=self.train_dataset_len // self.batch_size //
+        xr.world_size())
+
+    self.device = torch_xla.device()
+    self.train_device_loader = pl.MpDeviceLoader(train_loader, self.device)
+    self.model = torchvision.models.resnet50().to(self.device)
+    self.optimizer = optim.SGD(self.model.parameters(), weight_decay=1e-4)
+    self.loss_fn = nn.CrossEntropyLoss()
+
+  def _train_update(self, step, loss, tracker, epoch):
+    print(f'epoch: {epoch}, step: {step}, loss: {loss}, rate: {tracker.rate()}')
+
+  def run_optimizer(self):
+    self.optimizer.step()
+
+  def train_loop_fn(self, loader, epoch):
+    tracker = xm.RateTracker()
+    self.model.train()
+    loader = itertools.islice(loader, self.num_steps)
+    for step, (data, target) in enumerate(loader):
+      self.optimizer.zero_grad()
+      output = self.model(data)
+      loss = self.loss_fn(output, target)
+      loss.backward()
+      self.run_optimizer()
+      tracker.add(self.batch_size)
+      if step % 10 == 0:
+        xm.add_step_closure(
+            self._train_update, args=(step, loss, tracker, epoch))
+
+  def start_training(self):
+
+    for epoch in range(1, self.num_epochs + 1):
+      xm.master_print('Epoch {} train begin {}'.format(
+          epoch, time.strftime('%l:%M%p %Z on %b %d, %Y')))
+      self.train_loop_fn(self.train_device_loader, epoch)
+      xm.master_print('Epoch {} train end {}'.format(
+          epoch, time.strftime('%l:%M%p %Z on %b %d, %Y')))
+    xm.wait_device_ops()
+
+
+if __name__ == '__main__':
+  base = TrainResNetBase()
+  base.start_training()
diff --git a/experimental/torch_xla2/README.md b/experimental/torch_xla2/README.md
index f30be7ff1da..dc5a1fdffce 100644
--- a/experimental/torch_xla2/README.md
+++ b/experimental/torch_xla2/README.md
@@ -4,7 +4,8 @@
 
 Currently this is only source-installable. Requires Python version >= 3.10.
 
-### NOTE: 
+### NOTE:
+
 Please don't install torch-xla from instructions in
 https://github.com/pytorch/xla/blob/master/CONTRIBUTING.md .
 In particular, the following are not needed:
@@ -18,71 +19,71 @@ TorchXLA2 and torch-xla have different installation instructions, please follow
 the instructions below from scratch (fresh venv / conda environment.)
 
 
-### 1. Install dependencies
+### 1. Installing `torch_xla2`
 
-#### 1.0 (optional) Make a virtualenv / conda env, and activate it.
+The following instructions assume you are in the `torch_xla2` directory:
 
-```bash
-conda create --name <your_name> python=3.10
-conda activate <your_name>
 ```
-Or,
-```bash
-python -m venv create my_venv
-source my_venv/bin/activate
+$ git clone https://github.com/pytorch/xla.git
+$ cd xla/experimental/torch_xla2
 ```
 
-#### 1.1 Install torch CPU, even if your device has GPU or TPU:
 
-```bash
-pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-```
+#### 1.0 (recommended) Make a virtualenv / conda env
 
-Or, follow official instructions in [pytorch.org](https://pytorch.org/get-started/locally/) to install for your OS.
+If you are using VSCode, then [you can create a new environment from
+UI](https://code.visualstudio.com/docs/python/environments). Select the
+`dev-requirements.txt` when asked to install project dependencies.
 
-#### 1.2 Install Jax for either GPU or TPU
+Otherwise create a new environment from the command line.
 
-If you are using Google Cloud TPU, then
 ```bash
-pip install jax[tpu] -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
-```
-
-If you are using a machine with NVidia GPU:
+# Option 1: venv
+python -m venv create my_venv
+source my_venv/bin/activate
 
-```bash
-pip install --upgrade "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
-```
+# Option 2: conda
+conda create --name <your_name> python=3.10
+conda activate <your_name>
 
-If you are using a CPU-only machine:
-```bash
-pip install --upgrade "jax[cpu]"
+# Either way, install the dev requirements.
+pip install -r dev-requirements.txt
 ```
 
-Or, follow the official instructions in https://jax.readthedocs.io/en/latest/installation.html to install for your OS or Device.
+Note: `dev-requirements.txt` will install the CPU-only version of PyTorch.
 
-#### 1.3 Install this package
+#### 1.1 Install this package
 
+If you want to install torch_xla2 without the jax dependency and use the jax dependency from torch_xla:
 ```bash
+pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
 pip install -e .
 ```
 
-#### 1.4 (optional) verify installation by running tests
+Otherwise, install `torch_xla2` from source for your platform:
+```bash
+pip install -e .[cpu]
+pip install -e .[cuda]
+pip install -e .[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
+```
+
+#### 1.2 (optional) verify installation by running tests
 
 ```bash
-pip install -r test_requirements.txt
+pip install -r test-requirements.txt
 pytest test
 ```
 
-
 ## Run a model
 
 Now let's execute a model under torch_xla2. We'll start with a simple 2-layer model
 it can be in theory any instance of `torch.nn.Module`.
 
 ```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
 
-import torch_xla2
-from torch import nn
 
 class MyModel(nn.Module):
     def __init__(self):
@@ -101,8 +102,8 @@ class MyModel(nn.Module):
 m = MyModel()
 
 # Execute this model using torch
-inputs = (torch.randn(3, 3, 28, 28), )
-print(m(*inputs))
+inputs = torch.randn(3, 3, 28, 28)
+print(m(inputs))
 ```
 
 This model `m` contains 2 parts: the weights that is stored inside of the model
@@ -114,6 +115,7 @@ to `XLA` devices. This can be accomplished with `torch_xla2.tensor.move_to_devic
 We need move both the weights and the input to xla devices:
 
 ```python
+import torch_xla2
 from torch.utils import _pytree as pytree
 from torch_xla2.tensor import move_to_device
 
@@ -121,7 +123,7 @@ inputs = move_to_device(inputs)
 new_state_dict = pytree.tree_map_only(torch.Tensor, move_to_device, m.state_dict())
 m.load_state_dict(new_state_dict, assign=True)
 
-res = m(*inputs)
+res = m(inputs)
 
 print(type(res))  # outputs XLATensor2
 ```
@@ -164,5 +166,3 @@ from torch_xla2.extra import jax_jit
 model_func_jitted = jax_jit(model_func)
 print(model_func_jitted(new_state_dict, inputs))
 ```
-
-
diff --git a/experimental/torch_xla2/dev-requirements.txt b/experimental/torch_xla2/dev-requirements.txt
index 4a32310fbda..208f70d5fef 100644
--- a/experimental/torch_xla2/dev-requirements.txt
+++ b/experimental/torch_xla2/dev-requirements.txt
@@ -1,9 +1,3 @@
-absl-py==2.0.0
-flatbuffers==23.5.26
-jax==0.4.23
-jaxlib==0.4.23
-pytest
-tensorflow
-torch==2.2.1+cpu
-immutabledict
-sentencepiece
\ No newline at end of file
+-f https://download.pytorch.org/whl/torch
+torch==2.3.0+cpu
+ruff~=0.3.5
diff --git a/experimental/torch_xla2/docs/dispatch.png b/experimental/torch_xla2/docs/dispatch.png
new file mode 100644
index 0000000000000000000000000000000000000000..fcdd5e9e58a39032593fd3090fe1881d433e9a4f
GIT binary patch
literal 150015
zcmb5W1yq&W+9<4ugoHGr<RX=p77)Y*NJvRH(jkpB2m+GQAdLvp-7Vc9NOvQ-Xe5Mx
zzPk6>p6|Q=-Q$k2#!wbBp80eVASWY^b?fe}YuBz}Nj!rqT)T#%cJ11YRWuavjdAzs
zAo%CHor1W?wSs=q^=sECuSvk4C^~6xC85@lDownbI~2p9MMB3Bd5%`!`jy~E*ym@6
z{Xu!T93+wNAN}p`2g1%iK8s|b<c^ZL=H(-dL^XU3z1}Zuf5vzxvEGipw3frw@wKth
z6#_y0Y^OhwZ;j7*E!pmJ&3(?Yuzqh+ATr<$n*YCjXkcGo&DJ^Nso_z5YlBfD{mVy}
zE?o4JFf#n#K4z4Z;Tb3<HKPB%l%~W4^IyUA7sf)SDH*=@yz<|dzJU{kT*t-vw@;SW
z8+gp!)ODhNUmDh##{RFUp+w@s3F}n2!8rr_H!M;j;i++={3{aSsVHEf*_+QA|Grch
z7S#T4v<9df78Zs{VSTj^`ghKFW-xf?A0&DaQL))9wi20U$KOYSeI-Pq77PB~y<Ma7
zUj$YU1UQY^u;fzy`|DICbjqS&GX4ssgu>TBZ_sEc*^GMcM1GC9_JM`)4-%P}f~Tt2
zoR2u+;&?G2d0rl*+!TWjiI>z}OL8nYKS=5<8Ioiiskb0xK!VAUBC$67NZZ4fToP0y
zl{5?ZulLvkLZl7;qS$pLG$w#-8tA0wZ}68BX$mv>!VRZhRhDm8Ocf-H<}!f`y>Wzf
ztA8v8(UuUT3yom@!4Y2V;H;rL{VsuM{=zb7yy3o)vyZluB4u81iKglz?=+lSvPy@-
zW4s;m;;E7N0Nc}|CZ#_J{DT$-?*USJ9ReQE0K`cn$r$i$WPPG3;Vh*-w7@lyN3wkT
z0wxuNeT_>4IbIr&Cz4+{l=u(SbkbfAV5;8YR#HcXQ!)cI4Z(74MTL<U0c3dZA_{P?
z<30-lSCn<;X#NQ~3?H!L@~qMncsm~Wf5e?QOF+;pYH*x`Pj=dWfBg@suSa&KJ^2fH
z0Cl7g>LdVM9P67CKt22~)}`bF2X@7+V*^AYL3hsgD}94gq5^?kPK{I)#bsav3vTXc
zay|c>D}ef-<N_xtJk9k6FWG+vjwg40?m*-PJxU?5K~V}$844XxJf>*;=cA0k1w&ip
zUk|{xhoWHH=bER{xLz^TK;Nv?Xkrn0N#la2X79)u0=WP4u@wPJWgbb65M0MK!35xh
zEaYVZd4PlC(!!~J5e82!UP5Pp=ugZuf|btOd?OkFUP1`orW_G)@k=~_dHg6Ae1Lgh
zfO*BD9JxP%^oFpoi>-VB?vH@L363oQ=Vq~j<BnFchyMHPKQNC5gpL>b+3OludKVj@
zyN?FEEbj2g0$c#Ty76BMIxG>~zT9j74lt(|`ZI(3)@wPYu&^5dRNJo-IcaDFAHV`6
z$AWmeKXg*qU2szNN2MlkmNPh>Hnjfpqaa}$aQl53og7+Nm^e6Kf80E$^bfHTE(I5u
zzP|na6l{xwfo(5lK6d*H_kafu*=iM_1POlzCv+(~W<C59NIojCl%6cZ49Enh9{{J_
zdh`XrKpHr1uoFDigbskymtJTL)Wd(Y`*Ji`S$CYc!Hfh8y9L3ktMm~Z%MNx0pi9OA
z+m;T|^nt2U)<3)ag_|HOIHBe`gSA}%9Eo-*6>#owEI96LU(xixzy8m4{?`DrnUF$1
z3%>yd1Pc|=zzC3eNczmB0$lLFY|1kkaJ#Kh%~ND}>KCw?pvrE96Swm^0G01Mjgma_
zEl8HV_*I?*F8L3MpM?N8hQH?Mg5w3i@ev)`FOo!4d7%R`Sx4}2J0a<G-r^cj`-geq
zsT1J$oVBQjZeZI#{GGZ29%x$4qViUh@&#C7ki)M?^3Pp3UWq`MA|l@_Z~>mZ`xJnq
z&%1daU|<j|HZcH?<%|U27!8e;f(3vizd!quV5PRJ%n@MOKiJ#r1K8DyoY4bTDL^zO
z%zL2t&n|fCJor69-P;iX0Hgu<Kh07`E;x5X3LNL|oWuFwU;pPi_%*<6oY>IMl)yR#
z`Mm`+;DC6I&Rq})-v732H1yzhK8_h6wsN{)^R4M>vv{;+K(`oD`-mI{q?>d=X1*ll
ze*V(~N*4ogn3*UQf#b=+@m>0b+5M0q!BY$T77HxMayU3Z;5dGR=?_`?Y5{&9KXb#S
z0c`t+zrWgo2bO8Z<~G5??t>K%N62!5|Kvj%NE9T*v*Q7}Ljbli*b|Qk00vCJV$O%)
zv45&mu;%y2nHIdPP%zU4u<Re~Ei4Uoz4RV31go+kn%+_-eDDX5a@OGYw`QBnk^ztp
z!T%%nhDN~JBXC?sgMd=}e}Da->tMgM3@Y?94zOIJEl&V5$AOn|Mm8h^!2@6E|IHP@
z=@kJqTSQ{q4q)g%w7M<iRbW4oP$Kxl0Q|900cdAhuCfQJfW-f3bp~v}>TT;%<?ufY
zz#p<=!VP}kw{u-k2W<O?zfJhS1Jw(9Ms#sI3BZcD4Q!=fp2A-oMM)1Hu$=x$@i{>E
zpVpfbEWS$u9{Z<C{aakT>L6(xOMX!gSoRP0ri7Fe`;+yD;4no%)BB+gi>(3NueZh(
zt8#(T_p(P1zQf+!9!wIs^Hj0s7YhhWK}Kq5rm!85(Fd^F;D`UA=D;~ZDAT0!pWxtW
zDabf#I2s0jQ6&roTxcq%1T-vCHN*~;TA6`C!c@@RaGo4oSeOQQe@+k+zw8HH{!ogP
zsmID~qNyWj!80#05_nC**1BNf86z$_+mL(Z)edQA-cgqbzyjD1Pd=f~4*Z{DLj-iz
zozK8r&_HS(mqqKh1js?Uh|BxiD*#FxWQIQc%ux;y)`pIelFd$r$w7Pxgx<~|S1BJ-
zcZ??ulPnNphCue%Chv#_fItdy>{}THLebR!p(ufsVv|-9#eD{ZTneUH1NZ}QO{y3;
z)eJck@>>ES$AkS`DId?P4mu__SwT=V71Bfi9jl$nx$1F1Xq9)|%eA1rcg4Z-CDly{
z0Kz}M*#9(*fIW0S<OIUPfLf1%SJ@Z;rdkL%br?;i5-1G<NH4xT&Q+&Ha)wkLt6+vp
zpfF^6!aEgu!}FEP9|F|mPoJCBLVGzO53jv=G!lUDf0Tt^&Je&_6PpsDC?zh~8(_Wu
z@)z%U-36y+<0e`HGr<5cTicdOxg_#FWZbk}B##KZAa4VZ^aHg<zL_D!`_(5Suc5sr
zq+of7em*^b`2VQ=eo;IeAYU)%L7*@U?8RH?K5ENb2{LGTce@goT8toD!Yl}lAYQl+
z*3%k*kAWo?0^snMO?}q*gSMm<;iv_?eK}9Cb7Me}7Fh6BpnFAOQWt}1jG~kjkfeP|
z*^uw4Kc37bZ!`Fed8`JBLflyv43HoieVC@C>iM&#U!K1Be9dM7BQj<HlkayiE)ir~
z$S_#e$`BHJi2&MyC*s^t3;FH(<a3bXsEJ9ZShi%!4!D2160l8QYlDT;o@iyGl;L=>
z1TfXrK@yYyTz4_`3HddTQ{tvi050=T<8p1_>4TF?U4;jTy+(m4fWG&<F?<FCh*W6R
z4zY~HS8KmmqNuhy)OOvu4^FMmLMgioo%)wh&6)KMYXjE3=9Ttbowmfp2?+8pTj_LY
z+!t@C<8f-ZzBpU&s*&>8eQ}1gYU7)%P`UHLs<Y7gNN()W_EhDe=`M#y|C*_474r=g
z%%Yu{+E{nRA7=%l=$5;Y0g$&5MS*JhK8vfUfe{wQiX^I0xEkc4Z_XH=QCxQf*~)&f
z#sm(y%RWGOkS-rjt%j4-9ynye#DHM}P<KOkW{aM4PqaQh`%aDUBUDyxy8Mao)VR9*
zxkRXIR5|lHfz0~L_1UAD8i&$*R&|3F<1d4Te)h07x@BMJTr3h-oXom*u<&>Jz9Y5u
zpK&|r%g|tHys(|J8xZUvvKdiQnLwvc3|qz}EEHTzwA=dn^me|6^8<!C2y}!{0MjY?
zRF3;59>b(dxr1woOAl>{qH#JSkbogb+Y44aP(xPth9J*pDnXWg#o<J(Y2DPSUo{l4
zHE(QT!cz1FDWE6R^=Zbpb$LUdDe@CO`LT?dih|+iCq^kio_@I7Z{}A{dZCi5)+~5l
zop^4i1+fGweTlmN!lJZ~o1@`;Twa{aFf-9sGSl@QRS6|te$7TsQl0InmZR4B)ZC?V
z1sSlK4#g}vkYV4S2hz=(WuJh?87DkAn)|#xZoNawpv(Rw%N8c*1UUTGT$@=cTDnxK
zN#v+9ky>Z2T^pfF6iP2`tmezP^VyS^w<$$VEXqfekGDz(Tuy??1t)i^masPmKfOCa
z=v?l{zFrKVck!fnXz*;kX8g9lGgU(7W?`ep`o_nn?<ON<AFn4VvKl9PU(LV0{qph0
zeZwH8694*({T_$=Qod#6K$g4CE{a;POTnfF`)?GNyQovIr(Q7<J+xbup0e#>-mYFt
zJ}9`yWfnE#yp}XtYCIrw!X;vGx^3lYE_AsUTE7Of@Z9gBpPfTAUU_Kfhj78B_Bw<v
z2ixD1X;g&Rt@ai>&be0}p5jR{$+KUKOVB47oNtxNKHROTDf&fLbs(E$$7ZTc=mf9o
z)pvI`6b@)g?9o#!fh+hO1|ZZz&YA+o2K;(xvKxj0L%$b_Pba<W$ED|p*7)`sZpVAR
z)jQ`i2#+<b8o+N8AdaWdFw4%6tmKw7;9Cmj*jfx?-BDagu&l{`aVOQ<6}Xh^dRpJm
z0RAK7+wl#*J?AM`(RR)_+;RT?@twuj)F*}*j~4x@G1*Zx><2~bPp0Ra-mcX);KDE#
zx-QS>u3Y%!(XO|jV@NGV;}`rm7!b0$_o_^Oy+c-D&j;%sHxr4f)=bH-$GHSBF`t*q
zwb;p|nrx0FI*R9v#CAqJ&?W>HOAdr_L|C{$>{xIaCezEKMTt<F(<LJ7a2_QdwvVbA
zlhp!oS>s<IBj;Skr!2hY*MUaggKObEiWcsRKTR$EzRYpfxdTIdRpk0Bz@6>u2wT6g
zzIPNJs>gPZ?IbEB-qi0lW5m|S43N7Y4zjox+S_eH9!DJl7k5`XPSCZ&k)!$MsZv_q
zm=n*!&&64#F)hcWH7{aA(;z0Np)eJc^LCyFz2%o`?nfg~#xA)tL{&j*TnuncN7&AH
zDBpHr0)knqK(ysZh`~>{eA|cBL5$L`<9-zFezDhaFw-dz|5UhFqSg_STsNOMy?45n
zGDWQ8rUz6M+vw>--0x7NrS&n>6*xh8kQflF-5FBRaWDV5OfPgl#8PCUicLrUX)+dx
z4+$N^c&NuAHUH1UZ)>$>xUe4glNn>zt>Skl7Pd^*>Pba>K1{A>>zUNDxgDo>?Q#;W
zMaBzicgz~kMhX|FKbylT*}-K2;m$mX0Clf-s@GyHR}nHl9E98!e2E3F`}q>Q{;}|G
zD$Y1r?br)op-#n_^`%qM#qK+y!&Vsm(9q|%*pC?4aiBxi{UBF%<=tC2k-HM)QUz9M
z_ZhQtejWvUWk(m-HesBibu89OB+$xt=TPYCK&axY2nMaEzb7hc01zEdC&Q!lZ<(vM
z8_v|IrIxkZ+>jpJ=AS#25e!mRV}=qG7_V#MnUIcX<jN#+;W5X9O*Bh|o<DO>9FYTS
zcWQciG;p}N87ImN4)CanyCHq>Y#>(5pXq$3wopZ7Yo``bA@p-y_Vsr8n9G$I@7cPn
zr~0?u*iHp}?is}!0D3NA4vxC<hM(J3Q~IHTtAt;gL+a>q^GX6wtY($wE>?N48t!zh
zIEx={jO1?J_(q^+1WXY~!xf#(dCs{$rt0^*O<~!1^%JrE{Zr_%M|<O`v%+Rh=R@~1
z`+Gtjb;OS%+Sq2fF0u){`$odu6z$gfk7ORR)NYkz<P3aZZz1=*uqo8EwGFrOJ62kx
zu;7{XSssml<d9Q;G^(LJ!B|*VGS=whCiN0gf0DdL%jvvtP{`CQT2L`*rmZ$y8xdX`
zbY7eht3Am-6VS0dcXjEAIGGP*EHKfqsxM!Yh*OE|RA^WO4ugYY?ri=q8B|6LNNxie
zbh8I}H9j@#9WjaC^~dL19Qv02>^zTGqF$I0J?p&BYqJPH@3Zn)>ibxti>Frd6{9Wt
zEj(uNFhd=N^qlH1dg%F_ptAp*xYP2%T5$KvjRW6;1sYxv6tQN<{h}f-B1n#gqviCf
z1ninu0hs&T435(_;)RcN&L_v}uo|pwH+gkP1DI?W*5h4(JEFkfHm2>`$4{~r(NS1F
zs-CRcn}!=I5Yemeb4Yz#%881^!!{n)L*_WGcfMVja?-gHqtV~f<4r|J`M7G=2knW@
z)!7*SlWZjez3`uJDK5UC=@S|Hgti71DkQT^B6-r|$P-Vlx900sAGR3AY8ioyq}If7
z(~_~!c^rKVzg5?68yXFLid%{Mrp8=OC%TtUWc)Jr(<qF4NEp>^KVIydw2BgIDeimJ
zEco3MU^fH?|A3n^aj;;@s&NKCV&4Q$t@|V%1hnr>KrSKAI-Gboa|L(~5|1Axc9yi_
zNsR-$hNpJzKyh@xcaL`%c}+bytiE>}SV<AtlVvD5E^rvJ!GJ^pJSf_hq;G9G3*AyC
zOvXM7HH(3al2>#c=pUOB_pEGFqDsp|i8i=FUKg=fb!H1o{Kk{`Wwq>K`|9NJlvT<H
zX%#q;OkU)sA39TJ(K~FijgkBP*`{<K(X_`Eq?lLQ0^2$TpExIC8kh7K&BmJ-kZh(`
zn_rX;Jl;%N{w&ivv7-1RC%0fDNlp?5mnTMM?oLZ;EiwESDTJmmV$mISf7aA_#}0`s
zq0Yx+$=HIe5ICbFf%kjYfXg%*%`;;kN#^qXV}^(m{q9d)>gkEApL18v7_DY8pF5jt
zqaB%E1J&s2nYi6}bva8XSkc2$JFcXnrIeqO^jP*m$!;<YMX#E1l3`up1GPqvpLLsw
z!UPT63?-D$c3nRyG~{L-*FMCJ$LVU}D{#`0Un|nGZn}w5%W%FXbY+T4;Z{UdrpVFC
z=b^JFgQ+2B@Pu-aGxI>qb_`ax%i^O%{v0NCsvQaPK)Csjpd_yH-5Wq-oxCOu_f@c}
zKT_6D?z{L<t;<Xoo<U?*LoW*~*>ZNJq6%&&B~Te8TTzdc19sJ$E~qW-_M=s__hf==
zzttvs6mU;SY8Xsdw_r)>^^CL8ER(#RG|ij<e82HCX?aK*CsLgmhOyW(;CX3>fXfZH
z(=G(hOOgB8b|Dwm?Mw2nr$^Rp!0r~a_kJd)-`CO((AD6@?Qf@`&+Z{vym|VdQy$-N
zIOw=!o!@%?OJEj5&>2+}oFrlD@TRvEpN4H0t>9y{`%Yw}PV-(F7d$)UXA!pQO~93?
zkHu9}MSh9qKojby?d<ltpKq<z<(WFHr_A)5zyj*WH@ss$O(Jd%$CZsg`(DqvJ0eot
zDE7W+xb3y+kHiH<1TrPk2>`E4as}V&+@JdWuD}iUS;8?*yXTn*TJm6xSLGva4z@6i
zSsy3D{bU&Dz3Dy6Q=kq7Ld9qGWQ5zl`E&cVgx~bM$}1a^oUJuF{Qk=?cBZP~Rg#cN
z%1n0Y&P7EqPLgAv5+3(1dvwE)>D+q-{n^%1JUzPJn#38#?6oEp$>9;Gq1-07uxdE{
zd}47HdBSo+`Tgf2aijl?j#{l8peZ>ByUGW1&isW1Am6^-+`(L%Hk{VEv4c>JSFWQb
zUT$#<oo$Uunsu{|xpLaJ0kyP2pSedfCafMf8ihQL`W~IrVYWD0({&*!c37y_`6#>U
z0-b43Z?%L!91o%ly>%ecZR|c!7yosIw8FrnPmgS)V?j`)ri4y+_5?U!WBhgBneMOz
zzvcEJH-6P~5p_!caWMOx^q*w|2FTY{_Qlwo9+8&eo^!La!oSrL0*o$21%P7eAnnUI
zY2#Y5YfgKK><a_E$f4&u{)q1TFV~Km=HlMyNmpe#W?1vczZJ~xY;FxspJG7{V~ku|
zD#tAY!O3{Sn-V?Ziqy_#F$S2=lmN>~1`RUaNB=YtszKT1eDrtS0gp)DXJUN}O!FZ^
zrjaQ*aOToq>_yoDUOnM`x|^az+>W@Po5j4Xn+40gB@EVGSeIse(WHMAr50Pi+*4*o
z_|?)ghu;jvpqHe^t`$(&sdf{><(+k2BSVM-LgtV;FNYU5(cr4>Zm9xMk|ZZ*O2$=9
zV`5<2#ik|I|B!*NHvMz>p56yXov%*DrD(}~cArm{vJkvKMJY-;-B;s`gB?Z3Reeh3
z99Q7Zj3{;v@~e<j&YB`Q?u6pK0~GDHJKU1AIqW8L=bMzJ@@Cf&^ezg|i{i?xHAh2j
zNVJoIXphO~yHq~%O3PQ#s25@^*MUjx58|pAknmV0vfsAVYr}c;p%M3C;!}@>r*F4N
zcI~kF9*)r*GNS6>JY{%WO+}@LE3td)F>tf$pURV3>yZV~bK2>xdTj|Jw=~;c`(Ag@
z&#^i<jNyes{ouxd%<53yeO!z2d#QCivraAi=}%ixHu}mx;H&Xs!Y!Y4!|vWVcrv=(
zd$b!|NMyvz=R6}12flaUc&=F_8fVz=NOP4!z+#-LtcvuyMOyvYM((U{qB33Olf-Xi
zrh!ZrehcefzAf>495SiAP1Ntlo_`R*3-|47z~UbNlF-Q~mKeW^d^>X_u8w_u8My-E
z(_1;w#pPWw133e|k`8#}3*N*tJ^Z+kX8L5OW_%?a-9*u(QDcq>^n!yor<W&<2<JRD
zJ%dkC_!DC~p7q7ox9D<=6K(v~O!2wtS}$xWTCn-YPZ~}H^>0MlG<Vt7y-URDaFDj&
zQ$SC^SNmXL*PZl2)`>g}zcb0;C9JP&UzrUQUDgd{3Kk|-nk9=<7md;AZJ4pbecSHN
zJ#9YJa}p@B#?<(FBlHAi?!cDZYQZ(qpGoMnOLl7`x9~V{90Z%eb6(NGh)2l`s>A10
z3?)(cFEvk@RJG6~;kLGky!zoGUxOAFL=u`2;*QM8&OR!3nfOV>I4RU`cpIh%5R7Xd
z@kja-tQscpJK1Lo2u$w$-3t(eCh<i6F;mHWc4uwjGoa$#tRxNY4?R=SPzN#pXm0+n
zqr}vMaN7Iq2O@Hjr3DHyM5Cd-=i@hYvASLEwrqY#NwRK6KUhiA&+oiW1iwvh5`{tB
zuKC`9VzUwW#+~@uY^#mEglZJ;K>>F8bQ?}opk~uXSbrq+?wT4s{!TO(=c&P-x{+4z
z0Is$whn=p?peXUtXijf_Qm2TTK+0Pm(or~$$Q_Z^CjB_>Zg?tGI2u7)w*zrfAaWqL
zX>=#v%#07FJ{}>r50(sdM{yaPh|XPlTC>EQd+Ni*&5*C0{rE@&nA3i77H#D;Fe$k=
zy5B-yY&?lIIFXjQDkAxgCt^25p9p7TD6yu9)Dc1Vl^rUw>G9d^=SLg{il^(oh=}Cx
zM?=^FrQ~{Lu;0C&!k~||ZY%tS@Ck#7v>Bg7E1ZbBwf{OQDF{s5`ss2QF=DOYvAgJ2
z^d(hDCa4RE)5~`YQ`3{hZO*xZ#v}}~Weaz;=3Vb`HY6;s_?w&hGu?jtL-#(b;`k<u
zDyzstWc=AuN;uPdeF-?xEN$XOOWJJG!rI>B77k=;nx}HpE5=X4GZLR|c4Mv%pvU_1
z`gQKlx+0uo4`k3*Cj(l%%y*G&Kh}047LhY-PYzgCT50cL%MV}$GWm1+c-SFwsvNq{
z93bB;3?7pgsy!kL3Sl7F&hcQ81u@3{-~yaTfEz7?kQ4hhjGOW`4qf6Ahq(~^7Updm
zk#{$Ji9PzCh+uEv?!;6_@-yJmmRRFNv^eGA@@*0KV|6DcJ21U7*cEO{>=da6ft^j}
z5v?}6K*BL)I`Y^B8Z%sVUXJq0n0yOPgw0x_Oc@-<QKtrl><Hcat|Sj}v!EIYm&lnW
z$il*Y=f2>MJojW<;~WFxs{<vNnuRr`qzo1gOZlI-zyku*f3$Mo3u?Ye;*nu9mP$MW
z#%))-DaaT7KE3{U7^Y>EU&>J@b*NM3HoJ^DzDS2J-;%Ljc*b<LV?i4RGJA8<bY`J9
z`ckOiu_|jJlJfm0ocQbcm9vw%x9Pw0P59ia;t;UqHT4On_6u;}(}xA!tHh5;v&}^Y
z?&d)DCmJNx#LJ#<T)@wp)$sZ7@>3!L{&gIa{saZ#%^Twfyt{5^3%K*l%}rlNR0Og`
z!mEm6C{JROL&i_vbGZIZiXKBv4t9YuCnSHnW$D$?h|0-hSY^PuWv%&JfCt><L>aa(
zyWXFyy#4OD^)OdPdHA5>1bMd7Il@Zr;>_~NpatTUeA>Bvi^FN|=F>ZVb|qm_pG_b+
z4h3>NX`<q)?)yuQhpfuWK#||pOvWJlHT_3<J`?<4mM>J|KKLRo)fbFW2eFM8Z3&e-
zK$p)_7|X*jP8E5b@Eb;F4nTzNii)g8VQY<N2g7(N>m<}1!0lurPFC<zhq_8-9v)L9
z*J`J#vQ*q#LtaP}ho%T+-tSVmLrIPB-|9TA_8zeo_o+9Z*tj&c1*SH>4HK}NvUf#l
zbQ7Q5U+eBMij#7Y#q<;qSya~yzq!}mwj0`Aj&meHXYVws8!SZU)9($#$h5KUU?G0^
z&UM^}hEz>a?TJ-7NFY1itCx4yeZa04(bF2=bwBcXRc+Ygl^qNh0?N^<v+B*aFbro6
z>-$!Qk)1>3v-HDc=ns>4o4j!b+nj5tii+YWI?hvP!FwO!plV*?h&b#eF&q1ES~uT<
zp<#6M&P`M$Gd&59eergrp=US}aS3wNQc84NB-&;GBq`PJSe21qwVYygee|lXF3<V3
z_J^ccCqPX5P{)#tWwDHkFL64>wwVdxLL85ky!<7{=*d9GbDo)Dsk{v)h7up4TxCn`
z#~=o8$OL}1Awh03?CY=W;KtlO@U&?t$ezd&H8-PR6`I${?>?lmcZg!=fu$4S;-gji
z7L$AaJQ`a&co=}ErYcnWd6`my_pYxi`A5yYIx)E?sx~kTb9ER!a$f*9$PbvCCaVPm
z;;VIQH4Z#JcRHA>dY2S&@@9tt59n0UjpyZ}$nnCEI~Kq+_!ez!nt@o2j@n|jqnHy;
z#Jz>iu_TjTR2+CB8dba??w(Zqv7=p^G>EZeUPgM+FxY1x(Z9e*=xi`lGLl=w$wr|=
z4Om;d@c@Yxpr?|X@?CQCD;{Wd`_XE1_glLfy^Z{~Z$SiGROFy5TV89z(eN+T=)*Dc
z!z59t31)jHJ(_e=G)6M{(Ma-(r~%p%`+R!b_2~d^pBuJhRM?S~T_hM>FNIV}T|jE;
zHS(tX_UXa^u3=g;w_hOusYrGC=QlzOjQdZ*eW#^YW;+B*<2#$VmW>vMX-@6CNe~4|
zuP3{D><9^d?rFlKY-io;8W(Ap4SVOS1NL@olU*<P8A`%7+k9c0w_5UMKt|jx6x2>8
zO_bc>l;n^fwNzfiRTu%1XLC!-2HA=^2c%7maW_IqZy7Y)rF{brU~<BIf%0zK+k9l6
zW5&I=1;;h9vsv$M8W(cNDYtfS@Xa(9MGpFs`JbeOC|W?!z0U+df+wgQsyxS{f!yYU
zaKp<59BI<t%unK>+%m%aPYuumiC0h*@4|XOEV$y^&^+FYL8pIv?Cn=2r_!ocgvB&&
z3)lNLLN#xRk6kT`izCV|C{(WF&sv{}$vUwBo8c?KK=ZlK!0BFf-!pExQ^&94J_fdT
zxi#w~F#YGf*`79kVbI5uBx9F(P)*7<a>rI=wjB4wrai1QwG;0$PrX|7LK7ynT`&3L
zKG8G(f)584LAM<J>4}%>BStg~nl_OZJ}T?WOE~3N=WSR!+lUe&>-p#fI}&1yr2A$0
zR)#^eD$pdFqsb5-ev%bZ60NM>g`X$!CU9ZIr*m;%+@|}Xj7Mhb3E7xk3b?Wv;hS}0
zs^F(qtNniqW*k9wBc$&mSzWtYEgs7~gx+W{=`x0TRHw)tmg9Rk9J^w-?;Y^2YWA*m
z<&PEWGzP*j4CU!Ibw%8LL)<(`TbuabaTS3M7JV_tD5$XX3F@Yh`t4Z^IrE0;`%I%y
zxb1qKT+igkQ5x5Eq*CSOlOAW|9($lxFG>XTQ(7I!PH~0kO{Yu*2jlNB6?BReAlVWP
zn^~OIxM4-%5~%%(BIly>01wtL%Qg8<`7#AQ2z@<YPjCoQ8L3%>(K;ELGu{b3dREck
z;Q@Q8V^aB~2W~0wm`Rt=lWSBc&t3=E;9(E;(-y*em+M#XvP+9#=z>?Q#zzEd&CRGE
zvAnC_caM0rZV+ob>8<Pggl|{QHNeuVfLaUH^nL-~C_ob~5HSs4YC<`14G6PBW4TCy
z=k<3!CF~_5!Xy^+AhwlMDrGFFN+WRbPE-duddGE8k{$U@cy0&kfjc_o1yxW&li5<%
zq23Ur2ROEJI-BteYj+;WYEmr|BVq78R;%=DseTl`GS+zMaQ`;t0e36@=#ji$V_ZtI
z=jCa8{9s=IqjO2sz=w{~*H}~I#rpMME8mg3WK|AOh0~sX7$k#oLxbNzRq;FCZ^Lho
zi7FE^|ML9DQmCsCrllmPGkr>)_x|dITM`t)Q2CLBD{F(f1o>&r@FV}0h3f8Td=&$#
zIi!T@JJm!e*rXq_%zTjfx68gso}^Qi$nnyLST_EAZG|5nvN}(f;mTXBvx__*+H>Qn
zKkWy-Wyz|IoZOe?p?$BoJ=p0+nvng0#F%%torfHyB5Bfqz?puZT*S$m{IyT>(;ryx
z{9e|f%9i13hx#g23y1?c0Pd*kSoFEHBj7E6XDzHv!X!f;1}M-Rbs*w(y_s*T=4A+u
zn)ea);}gEM#20v}rX3)?FG+>IS7M)rksRd2rp!U~V`ruv!PhLq3sRLK+QmY^Fl=2a
zZ3cocd2?j(SaD2)4m6&$xcD=XKBUQGe)n42d4<rbVIz$K*&P0iZM545&CVHx?}vCN
zE4Bl<O)GBubz<W@wSd+y*AsMq6iUJ{Hj!3shwp!&_j`!W)t^XOXl>hMJ+1)!KK4)i
z-B2&uAk>RSizJ#*CTXyK2OC5_Vwu+J3%ie=6r$|pMA;_~Y%-T@RnB?LN{|G<*ktkZ
zD8Jq(VNDmQI^EX~-kUbua$9ICIz-s}L_!6ad0N#>wNfKzs`hdJ3#cZB_jH=ehk;s=
z1TY@pMnpZ2^1=9)5eES^300<FA_%jCLHZ((YGfTIvSriUH(}**aF117JSv<q00*hR
zipLDMbL|LE4Nb%MLsRx+OsZ&61I?%7_Iw67xHt`rzV(5OC6x1F8Ub|@chKir&}Ca3
zXf#Qv+JDF#yc4wm<x<Jm5h;Tw@HIHi9O`UtVsM^8qKCot_c!a^{n2;!{q|-Te!l2J
zqwo^m5x_%#;=J308fzKeiRU;>bnpli9lwuN8;w3*%Ce~M%d;vEI&`K2`nq1-Uq&Ju
z7f8n2{CAzrFVLx^u;nnR-X=2X*=f@(uW7ohXJ8EwVgF#OtUnx*st?okiKesXO2g0|
zbIJy(fzc%FZbdvbC%RWJFN5bx_S)}F73sBX`4VgI)>XvCQh++DSo%%r{~t1$QegY&
zE+7~6yn4gWoay`4FB4-S-9I{Xp-qJ8e11K0{s{0V+G)#YTBPgk5=&Wd2l==BiCcWX
z7*xvoM(pz@2I3VOohnc)SU3|nU5=ck8THlEe>=ERv%nGCfn<ZMvT4?i%vVJfye4fu
z#2lW%@dT&`F~zk3iRAajQBs!L%)w;dND137K1=W25~0+6D)d36Rq!Lw-E1A=Cp$Cs
z@C_!WrMrvQyWm>)a_cn~1DOOUnvms@o?16ag{zLO1UueXL?w!j&DI7i)PS!%3NmU4
zM+Lr6>J5<U>YOP#T}^OxSU$l=Fm+(xe%#z0T{@4rf3^WClAFs~^x?ESS<I^-tl1wd
z9QZJy&Ys&nWog54&(O#KX*_5SgB>wqcnT8rM`j+KJPn^A6;>GAx!CN1`x6PVkmY<Y
zdBW$yHk?A{(R%t2X<{{>UW>N-k&@MI?{mJdEwkPjR-5W7&kYtHbyImGB~Mz)zLl%b
zBlNZdXz1z)-eCZtQ;-<c1o@ZsZre(7oZX`%-x=mg2(=JFk!7rX(qe$-s2eMt{QQwj
zBHTAIJF^py$sAOgj%7Z>M_A3QOS>dtuZdxdGUK_B<`$C?JW@myJ9T~W4yloBr=!Q0
zp$i8|`Xt=uPPKYC(WG0`O=73a<~ZvJ$~wHlR2)$>&b1_bBI##Lyk-O%_$wh@4|kUC
zL|Xd9>8kPtFeOb$7zA7~nzjd%%&v1@hT2GC4H|xVbDa=lf#4qfqNa%bEGnoNG_D5S
z(_#%b1R>FGgPm%kwT^tW__MV4cQD`$;tRZEI}J3z6~BRga;LYp&6R0`%wxhZ);W_P
zDSQr{H8K^*)+tcf^(UOkw0G!X&MIPRUt27_ZW6K8z4}o<AN-YIm~crjh#}M(pL^EW
zi3>L#>C>F1KaXQRD7eoJz_dX(1Vw#3+yK4OgwS?ufhsBVb50C96rtWNonUs3gU75o
zoc}zTL-=YK_TR<CB2%JXRrhD;ooW{Q!6p!hPDOU)JVaEpkA~H*UN-bGDcx|>1s4s{
zZ0!HhY!x+~&8Bmpax#dI%ErAnKCWMyn^>&pa5yL)`<PDp_JT{ZvTV3-h|-#<*R|$$
z6+CLi0Vwf?hyg7Zt}Ja;%(10_L@&|P-cH@gi#hOJv(1@JDxkU`d1a@}8t%If)oWSB
zSp57;`nac}URlW*P}#qqqFnWZvYO+;t=!YrEkoJIS<N4Y_osm_ojP~A`gwBFnw$#s
zpR^22s?O>m;sKGh{{z)R?fSZ1hh^3P{Z?;mmo+kw(AIYQo(i&)v-r`rGQKI*6GjQn
zh&%0<AX}u}t>Y`#gd(g!-Am|TubiC-XtlD?g-5^!JnO{(ET?z7+HmPL!O-y?Oj+^S
z^#xxuz)8t(QIihRmgZ>k*R-S?zfw5;_;NG6Z3Q*xR-4}%wa5@CB7aw+@H!29ioG{G
z66O0WG!s)oW2W{)3P(kAkZS;wgt%pK0Vtz|USb9@z1^+AR!R!?MwVQ@z0l@ekrgL6
z$*rP(pQ2BRSsw><YjBB0Y8>N%&bm#PRoy&&mg?n*z|D=-5jj~RuS2Lnlv12biR4QM
z%4+7hK}-wJnYmjh?i3m|g*X<h9I|G<c?;oqm-UHNDDOf;^AQEeV-j!llFZWkoINlG
zp6AZ(hs&*)mfb9w5=Cy)>9~)q_1eYWldjWlVyg>^#(-9tnS{faJ`Y3$mN;d{cU`=&
z^laJc<T}mgF+cjLR?9qt#jr%0;h~{~M1FUBeKO<dDrBS|i7%tq1Sqpc4oY*|ufARK
zAwKK!Hki#VSZ7;wFY$?CniyM@EG;a8u&y;%sYA&p7iK}YfY)i|OQkNGEKvRVg|pin
zTIrgA4AdXO`c5H=mQao1hF)|SmYY>B34$3UjU1EDH}02EI^l36;OLA5pI|sU|Ii{Z
zbSp#?@*{r1we++7iC0F9x><u1Xm<IevIRn;<xie1u%4df@dq!C3VSUg;`U-1?pya6
z`SkaR+bQ+jf&8|R!R45GIOx8u^%zQ*n7tk?PVU^A5Q{g-C^6gAKB_E>|GyB=7{s*P
zO}Sniq&0wI!1lyG!%NzPees~SNHFnog@7#~>Q_Xb(&Jlm|Ach^?mXe%62;ZH)Yo9#
zvzvn%M3p*pUKUO-pT5H`2U*rM3zI^?i3uO0o~yDL*CHBfV>LscD{!;N`D=TeuC$|@
zV`@7UhdEN207JzqKd|o7+@yB{75k8zOhQ-4X;-(91&uK=%(3~lkoX-ni{iFNBaDeA
zq*lamt~#_`?T@l(_^aIuM%=;^Y*-2Kn0`1-iQ-js*NHJGjIuhj;&^YyXjo?7jc3(-
z=uLIvac1foXwc4wc+!lodxWF5n`(Wkvs?GXK7ZudL)kkh#%?@&+~^vhJY^`Iz}@#=
z=+X%>v;6_Nhpv?<ZL?x3qMC9NP-z$0gChI}ihKy0t{!zB$<ZTVH|u?lqA~SC(sR%l
zSk@GGT}1SEfOIEUN=LnEEeAb?Wc6)TvM{N~XY2u;cow)#I0Kj$^QIgLLPffd?_|X!
z{Yu1LGDC?teen*+wyoT4<*joBr2sL%#8(#ihn%=3P@_(tejz>7lL@uK<UnmO2_Y<b
z`1DTbBo*J&ke~Y-`$SB<s+^G|wi}8sQm`z20GN`o_&~`guX*Y%Db-FHmv>J9eX^>Q
zP4WkvURG<+_2RG}hE4<&@4{nl#=T&w>$+UjU7QlhZ}pm|1p!$Y@b_-)VN#{XQxVvx
z3wBTW_DRBF&CGnEF}5o<&57u|n4f@uV*CKKhK5MH&!Qm}HeS{ci(w3q_~84P&_Cu{
zJe41qKf$MJ&brsAg4QB7V}?6{PA1kV(P}*@%Ey>jBvKk5lf72sD3|zQBQD43#sli`
zjO$^WyZ-8;G3!PZ>2ZArn{U$Ht;se%!Qy2hk9ZZo&k_qX3s+cmPMwbHhmts}p?i^)
zTUuCzK!{?25Vca686>>(t6vBNe=H6D)G|1f^87YCJQD_ytxE@0OYb+MLRY}<BayGa
z*ddtr@at2lBiQf&Xdd`bl|21s42wQiYu{XC9_Jx_Hz)xXVA?*k-jBmSU>R*Z$CM=(
zDW+=2ybEvLJ<SKHBw68Wi<~@2lB$#+5fbDwois9jk!xuY#pqHg_#VufM2onm&e%d1
z*0OgPjq%hTMS_mbZ)0japoele0O}K?q%9dX4a?A!&WB-)sL%A${W!X3ZnVuY-Y@nz
zHWo>!`NSqXYASuPRt&;H!63u&45**XG@2!$86<%N@=n4w=#fb1A^pi1$V5W%(l=`s
z<YE_+ZqW-&z=D_-lWwgo+Yg1A9{D^;0##Lt{T9lC`TLx@U*5EaHkGcd#OfT&_~FmK
zW3yb=MyC&v(X_^x1xdY|{94b6yf8sSuR;R{==L{BRnC$GK%S}*@o>b(ZA@~r8>aZL
z&F>A7E0)Z#@RYXfdwn)ER|8&G&ufEVKPiKN>k_e`q41wh_%f8=v0hnwl@jRZ%mq0w
zgK}d<R$LP-P!#GuDTT+lxuKd7sy(xB^qp(<FglKBt?|(SVFkq!rD_jI#T<;?2+1w}
zPxtHUuP9#Rk0?H4AvFPX6V*T+G{o08K;tTCvHI2RDRqP42G1C*WZJ|?$n4<R6X7N_
ze6gw`22e8dl95sM1*hY4H)gQJb`B}b<hKV3uR@K-nz30TzZxc?2&rrA$BS2@sU`q!
zf_(82a@<a0z;wV3<$AeR_)68_)C1m65HU5tSsL2ArN-g<K(1HYNGz*p$57bK)Tk2q
zvBp5JayNgO_gEvKrMII>Y&?B<hSZP$^jWUMglAZgpl%f?F_-^1ZNvjGwpU#c6VLli
zJ)XydpQ>?=>~R@*y+&e^%`-h6Uq8r?0<EmH(Ox58L|R}{5#lL^M)Y*mPsUb3GvsJ)
z^_$srNl61!JSRagBliU)YejE-eBd?E*UP+A6F1?lg!p<h9}cx3iUl$aJZE$iD~J{g
zn|y$DO7*4PXfnnzd}VmbHGcM$X((PaE0Ph<m9FxGJ5W2cgrP21(bobHo8pdl^|El5
zQeowll1H;Z3A1k*(TE{1K=<v(-l-)T6-MQFvsmyQJ^l9JE_aRLYq%!RaFcpQrnv?E
zl*H4%Er%<Q3Ll+^r7Px4vXz9PP#QUeK&^~Ktq$>4UC{!`G@0CS8Q|O%Gur_s#|k!Y
zh(7e-lXRm<QQ{~BUPxyA#7kU4*wz@m(Q!_k;BukOjA2jwgu79NB*(op<FOD37Y;XG
zeDTIn3p4=oGn`;1#R%GNhHlkQ`*L57^j2Wcc-F-wP5C}_Ffaa{lgm(DdXbOo<%0jr
zTpwi9>s=*R0)&gv(~npp=%r3=r~y~bWvk=O4VYr=+0?}0vh<$j73dqu;+#=a^eHnl
zLf(&3l2-B;kVW1GiFH=Yw1t*;uCq)~bH5wZULNXUrq1lhQ+lrtb8P*wk=XYl+TJZJ
zj?I}e_4Ot-*zZmwrVy49QREVF%KVV$0ZbE{<|EILOQb3D9mC-uYokVfVzZ$f)VbLG
zGb8RN<!;zi?^dl1d5-zR=a9aTZc$zVJ+kmrU*Kw)F7y|U9-j1CtXC|1!rf-S_QpIc
z6*}k9GB$EmW)x1TME*-WTZ@#aMN^HCOkY_@8e~^X*?&SSqwy-66@}XGmw#f65Qh66
zAyVJ)^b{Mt_L%j`y41D|3V4g+T+O^pn0g&}-eANdj0nhvX*jcRb!_}ZxCej#;X%>X
z(v>Sd$GPnF$t*hiq~GO&n#XjC-e?4I!s_=D0A&QshPiRGg#Ny5*t3kgq7|^&uzggV
z;EswjDo1U3@yOUW1xhHsyXCK#{=1!?3`lGCjn*`x;+QygiMICBfU5>0Bb^^Y3A#qP
zgynl?Di*)f904;&HFC)65zu|upYi4@mx~lWiBhuvzOmcU*S3U){PXkbkGZfOTcCk-
z4nFm6rF||Fxt&iBG52Pdm?6;KK(P&n0{wed%%mtG8#hto5!BhV|32l=s*t;QMfP&w
zHcx5K%L*5l1eIRY%W}a!3*QeX>m^(Gpx;voCUxAAXvP3WL&8SeKZBX5;~X#mb4ndH
zTKfzZ1dq9JzxQaYiV@b6YsAedM_Mi6*vF4EqO;Ka_ZaV8s9*mn+ZuvO$U@56vs!2S
zXYbbCE5u1zwXD_Vl{*zjf|1dUmz)STj@qF{>_L=4QDsaZH)e>A7gh+<f#;jruGec`
z0LEBA9-17t#|o#HYN=JcP2i=8_iR&(26T5?htW$tZg@~0$i&fSkt_FXi>KGZ`f+@|
zx-x|MoJY{C6c{H-fRuQtPP{rJ-3OXF$7K4ehox3$T=DuwJs&1CE*lFUmcG>=R)$Go
zKI5xT2YSu9WZ9lFXA<_6@;85(%Aszs!{l&~OzVRn#nN@ILX+M3u%&m0tyJffszmEt
zZK-4wYs%Q@wf6kGIquS<1MK|yBUaD|K0lMO3XwwC5^#6dtFa6fn`SRuBFq-8GeaT8
z53Y+guVxvAPOge8;&9dT;z2c5jG-}w7)rR)HtNfB5qpW@0O9^NqLs`P@?BK$De}JT
zFTsl0*MFMKboyh~M>NLcV|ku=n*hgocdwB+C&u9Nb&gH^JrZ1!&KDcO-*4wR7*5ba
ztoT|Y*9hYKuaf{HWv{QqD?Fq)&TFr_?!Ss0`u0>s(8`D!Nyv4$S4?T)hfiGncbygk
z2YMS;%-=#sU}Ta*1!fPu%DB0;3~5W!hhBksw_8B$fuP6iKLt|-j~m4F!f7S)UE?Np
zjE*^kN}dkagWLvAUwb*|`7wH%WA<C}aG)Us{WN?pm0pVvaW*BH332tySNy@Ze0jP4
z?gc=XU$MD**ObWIJmK<J0^CJkm6^zk2I7aP&WLnk`+R^|bp-5}g}dB_GHqE*D?#^{
zwMb@(@XqBp?kU^^0&o!$_S-P${2FX2g9eK72M3BZoYlYB8NT2+cZ}orG&K_16>7P8
z{+#39o%8!OMbCcUsgORHUEweM6Ft4W_<nfCrF#aRXlf_&kPEQ{H2SiP2$>v>hhmyn
zecFo9bRV4Z9OAde(7Vbl)eU3axtI8!=0@kS&S&o~yHesV*NEdM2C;Ip$)o;CYIY5M
zSI~k}y8iX5`Ud3?IAzb@H$<%16xIX?GecM8`+Dtw*=CTEeg@409FnZ9qA%-FQD6WA
zE(A;^p_yk+T)6p7jSOZF=4$mS*-`bs5U0MRl27QB{!5UK<YZ#f1=BV~Q@MUwicSR&
zf&27Q<{o3i8y7(KUAV3U(};lb@c%X6bKj_ObU3HpO)ukfLf;mJQH66p&t}6k7;}J>
zgyp-m2P{ZfRKJykpM45B49OS#^jk?ZyGab=;O_p4BHsTRdBN?3CVD2eV*SQ8ZV0WX
zU|cCl-F(ennqXEff3h6^Ti@CWk}5glK1=$oZxs-a1ci}riT-U*e9zLd*kQUxfT40;
zKnNE`E`f%kxX&{d%4?>3J-!P$*l_Su8)03V8u6Tu9dnkBU1&11Ta6js1bP8#RYt;;
z<a&av@Rw>M;D0U}Bz*lhGQiNy8y#Y<>^2DK$;FxMRWRw|FKh&fuf-T=#mmk3i5HjD
zxEda~2Yaju6M;tD1C8W1tIHowHcDOsg9>4Vy*)#n3dt1rw^>#f{A>%2X#LGIy;i~M
zBi+AvW~D)K>KD)SKs;0JcvrD9v(IAtODQ==DHfhUf&kOiexXi#@8@{6P~?-%A-BJD
zb3?}`Ggc(izwG{`VCKGPs_O3x0?6s1NJq=AkL$P@gLS+gpKiQ=se{;DIGYO^a|382
zs>-dI<{aH05V~@U?dg)d4$VG+{2(9Wh`b;`XAZFZX;d%8kp)zgxl1L`<1U(d8;Gmc
zj=2jzY~#m#>^b<=CI^zwWI;LAXq?`^33YeS)x{5u;N5SDB-3984XdRfS2KRjVXDw^
zO9Cck(;#jpVGem6<y<cL70Ns1W7_329|mjx_PtED6`6tS0gcBqKc;Iq#Gh?M*k#d9
zj}d#2$An%ytl%zn716wUX%u=cbWhi)qn!5y#?|=C6&1ak1O~T;Q+~s;F{koC9y#zY
zC!`d#4G-ZflY{=ZAslt1I75q%zC<OUk549<N%Zn%_A6h|!+vB&#h)z`B4Q9Hs4IHD
zZ>}+t-MUK1F~8CS`qmD9Y<9uqFaTWTa*SUEdEzH_JX~waoztb+flp{Mce{5h_R@W(
zZ%dTp9udPDWYexr@Qk?Y{fxiISo|d{+#PXdv`A`y2}|CgwZ{g#)vRU$$7X2`;m+)*
zSO>&J<%UBO#%f=ZzFouF^tpVc<xIY~)%(uQ5;)+0`P@LJ7nw55&iQVj44|!Y`wrU|
zkh*({7<iDI_tNq`gKP|wJH8?@Zq~1E@(pIo8OFrU{fVL$yW4sTd*Wca$DK9jdOp<D
zl6&NoRqvE^F>*E0ZgeahB9P2hdUWH2+{$PT<q~An#ftc<@_%8L?~Xl~gyal|Fk2vD
z`IaLj(j1>&rmR@MW3pn((j02ken22wU!N(+6S729Qikq!HtOVhQ8HLe{;gg`XmiGs
zUBe{q$f7v9vZOG-IsrU0bE}@BaW7raW`(D2;GDUpF66QDDg8??%Xv{?WYr}MpdMAp
zX3)z_%4J^ZFs3zzuPhe2bMZ5xbLh+U0`UN0GDtwx#W{T$C1FwvHHuhxUYa0`ipbr`
zu`<*-s8?P+YrE|3#a?fw9%bXGMDEa70e<UtEC^>XbU8fCe@&1*d^{hoDk$S%S95RE
zXeLiQ<tE+;os(y1vU|QpsCK~4Sla|8q}t4r;4v15cI0N^P$D?<YLRimU(53q29_FY
zNj7M}7eWHUUGCkv55jim#qKJbB}p=9kSVrK8C3r6@LdDaedYS`ZlEv$<e-i8{HM71
z&#b@j^}n3@Yi}R=(&5Ccg{Opqb`UCx%69{&XT<~Rjb2ang|_3arY2?+hIjaRU&iW8
zMVKQ<&b75Zkp1>lx@0KUGUz}<(a9Uy-kOu|n(jhdW7CBVY_s}LHb<;d^UvR&=U=^K
z>>V=zTA1P{z;)n*5^0bPj51yU16g~IoInA91)Y&GkL~J&QsCDAGu(Aoi*wuSbK76D
zmshw2+m**Rg2!beJMr!*eP)&Z6{UQx6^x9=2x9u=Dey6Gm*1*i<@mIgb#mA<S^pdT
zw7S{A7rlhZNz_?Eo%4nZr}+BQA0x8^uE!d!F*D!A;|#ubM@w#MDbj?m%s9?5``&Qf
z=5qTn^=ddvKDnyo@~la>V?;a>9<vc_?~7&t`h7-o)eI7&A4wQ=Ny=uC!@kBtzK>C3
z?g&hd=eN6^J@u7A@d<vx+}P*P#<m&<gx-wn;U1pI$=#x=SN0bMo3kx9#qy>B#@g=S
zfq>)Dqg%wv!8Sww%Al3j`^fnep}!sN>0r=5HUBPj97!sF*8jmKi36}MwG0SY&vdn&
zRC6~-@@l~v_PZHf`IK=~h`{7uyo(4->iM`m=rRRy2AID?jbFG0tc>+qmKp3T5(K6&
zeQp3OjPAGlmCmQWuXe1G(r9%%sWz=IiKB-|`*qZjwzWjU-t@M%zVg)1yXTX4YunAn
zr>=fRx?j#p&6dAaJ}+P{B8bZ~a0ae>`{yMvx?n|}=6sZ|WDs{Wf$6%NsAmMmG^2GK
z-sy}FZ6fCC{JwW_;Q|jkN+sM7Om@@S7_Rolk$-C=)~Sp7;!(X4lN(^SO9!T+#CUbC
zrzA{RTxI_3W$LqVK$gBJd;k2qsnO3()#N})5G0T0u7`Jfc7gQ6+&eQbv}I^CS>Pmj
zg9VA?FH4MxnG>wE5F%bs*5&xKHFu=wif`hK<<T(RtW)X<Le$UFWRuiQr`;@XBV7Ut
zFI9{|#CEJ*++Cn#;MJ@jYpI)3&r$rdzGCG*_hCYZgExocv2?=e!shHi&Cz#45<kph
zqhn%<3Y~BDnMWG-r!TMfJDCGnU6C+w2FX}J7|>^5N4^Q_DE9}t)y?yj#<VR!2V-2^
z-gC|J0_H9my*aK@{)-Kq)ZI4(x!kk88_sox9ToeQ62%P<YCL|qWfXCX(I6gia?c?O
z5Ke}2GB2#M3t(YSe)Ct?Jf*p9G<l;%S#exIzUR1S9?&0CH`bV}Xd>Fh$oTqEFz?PT
zt8eUuz#tOkHo{}XjtnkSUgUO)c*LOgtB)FaP9$HrwiYxN(~eDk!VlvOK+w<FMAe^;
zTh*PTrBrnV6fRSky;p}gZWGp%r&bD7rw=;5q#*apR{w~qp-QVcMVFF6SWlnP>#{hT
z-X(uU_2CN<+7iyj+v*=tfKN~%k}@(0X6MQyzPW=K;RqU^BPUSuLn~wZE9BnQeQ$C`
z|1G(~Ce~`YIN-;(Y@?Z)2|J!GD};b+<jDb*j!>)eWxuadQ}{ENey0BPcw*Z;<CK2(
zg*9N#bn32bvlC}AFiVo%i3`*VSWK9H#Sx59x>V@kq6{QeZ);yx{yf>(uKVnW7#ZMx
zv~zQF#Isn20Vp9)1CZ3{&gtCm<ro;J4i#8LXDT(WME#%+CMy6DL6cjOOD30s0pDP6
z|DJ|@06eDVaQnchLNq#JmYt(HP_OsP1K%4{6|b<zSvY!1ewoL-152ZyV7e#tGBd@)
z42&>m93kfB&W@;4KS_hI^S@eqr_&wAi}WhG=n}>!%?n?>MZxN3<^q3&XuC%7!Q<0@
z{jCu3!u@V-vm|$mAN`GomzT54=UZ`eh~=MT#Ri}7wM)qBQ5Ex^mbad0C;OE0H*e>a
z{1oMO8g9xilky#v`Ni7Q0Q)D;b7$0oz=+xfGdDOk<OECX16#ANFK|JQ#b2l+(wG96
zfP3fCWp3CWe9S%t1{WiW9@;K<`n^`97L5TR_KC+PKxsh&vGMygp`9_K*K$kb`Scl3
zwlUC}L%9s(0~|V8Jx`_1!KjNq|4!AX?Hcns()1(O8c94aH)sTNz~e=}C?yB-bUK&n
zM`(^3U*kvV#r|pFAWJwc$J5+(LMnPNb6TL<%~y2%<YfHI3eVn#?^xH<wfvW*kDaH^
z9TQsz!5;~k&2Y&6<uurWblA{yX8C8T1|)S&Pex01I?rFt3M^@HyRLQ|uTSA)A_R<e
zH0*d-6;*-9v$jMiZg9HXm$`Lt^4aO+HDa(MM)lQs%zSdR*yke_vkKtS7^g)0L`y`{
zMOzmZx8J3;$uO2xc+WEn$eH3y-QwGpT@R1pjoNn%0pZ_~T0U1yrvhX3DFXDd2*o_e
zb^t<Zn$(~yB?|tEk>yYgqTzfJj3()VSwYWjFsi{I7$k=+V{LjE(4u~q37Rx&t9xgy
z>vOZ^3|q6GTi~@f@q^w8sgwVYtoM$Fv+LW2uPD(XYKR`4Xi1dlq6Z10j~a~LdnZIE
zdW}K!=#1W@j~=~@8r`VD=)C7#_x*g&x8C;;Ygx=<=G=S#%JDmnW5-m_(l*ZBYzHY-
zS|RA}Rb>eQeCV|@3a@Yl&_GiNU+y!YAs*ljv-2v!zhg<K&E<-LC2V*#*WaVcb$%tV
z7oblTa|((3JemSBr=Sw^u4SzNcParo)w_NGM!x3Nj5tNPX9(v3>EiAh^>dw){27#k
zircj=*E7I`L*}C{&e4~-uIyFz9!zrmDyGI4t%QH+Z2)eLjo3u%wdfuEXzoV+epB=F
z)kuK#kb{$2(V(w@X@Z#hNpnGV`Ndtpywp&cQGM66OiEKtM=b@(5!+J7>Z)5AP%H2e
ze95BzH~3xv!U0N8CmdH<@a{}o7}skPA3ZC$OZiyW*KMDCnUu!1->nGLr&0r_3%gkl
z*o1r)p8N6$UGx*ClNq3&x1(JRXMHOLrssugo4W_4FKx{R&i7y3b|i}7I7G2As^hmy
zY|yT$81d?LKh$VENk}}I;^$>J;z1Zal%?PGK2T8X&6KJ>^|yNuuKYfqYToz6oSFAV
zc-h|Ukb@Q4YGb}W3=l*0x`qY#c!Ro=IauXRO$;WUsiee?JTU}l7_Pi}ZRG^Be}!DB
z$0I8GGR%?14@ATIB3JY4=18^`d|thNo*Eaej=vvGImH$Q{CBAO$kr`_A1uPtHA}Ut
zz))P}vxio&Iat!fcYHLY<ag$7EFQPs)RB50B4Bub#k|iUryPBqTg!sE@jZhQnvPJ1
z2C!bj0D<WLSu~A|+IE1U%iOtHT$J^4I>+bYFcTv+5FM*fts_4~5v-tCmNq+fyQ=AP
z85EC7(aRipv0gLRT*3i6Evyx{g?_h>_5R6psh9aw%mO7%1Y`@zK=xo{1`?4eOF3DH
zY|7?aiE;09i<^=f`qk0&^Wu@)WB!}tY5D!Ai#cnXhW02<Q=5Z;uyZ4bUE5P&$e>j@
zUAYKgsDLn(&hPA#f=>FI0<lHv`om^KN>B3jrxFJRfHFq-*056bAiI9EKuF||OK@D?
z$bRpei;AU`ue%=ig`Q1+Zqnw<1U`kVGuRFvjn@a>Z~yLnK%|n~zMf5(-_34+Pr@Bg
zSAXkfI$|3Zjuj@Gk~tFX%cuMqcezVW9Q5dz-tb4ef!@n<vUp6%9aR8J{cuCn=s(Gl
zVyl-kD)}@?{+;OM!^rSf@FBw3+|+k#Q--w%8nw_9i#PJ*5$hlTsrF~g6Pe4UZ$5dw
z0j(!cv7s`hOK!4!%iGdL?v%?d=_}3FeVhX>vtOR+BS#~jG#`A=NM&4iZmrm@&?|t_
zd0tCaGC^B`VhwrG@vv0*F~fYaNTmUjOy^hl=PvpPK-cp_BLW}`!uq~ZZPWY^T>XMx
zCLbHqfWIag02nPSK!jXu!)V;A0GorE%D*oaFutSwToQjFsu@&C(G*z9QOSn2@kRY{
zfPb~{dTlNqhX5GcqV&5d5H-8SY*aF@V{7d8B;XCN$tarR%UzC*7)vj=);p>d8t69t
zW`L#&etAh=ZqXsjP?-$q>I^H&#evGSc>zF}%Z+qTDge+wQN?;sVys&GNMdYjBm-fr
z=eS#Ed)H07L;Ca+Dw}8k*#yW2{j@&AB-(SfPIAlQ0^rNU<ZgjvZ}BJwJgU`}wE2VA
zp`7>YMc0OJK4qLr#$#3n@iGo>SlAwmv@EW?>NjPu^n716PFH(+i;r9#3;NRqjFs$H
z5V>kDbM1`pY$(&cfbrx-+WaH)nXH`3?%ag{9rz6rZKa!SvHeBZiFtkO5s`XCH<@|g
zKA++XNtmESuJSG0B2Tzbz{}Ggv%Ul*qIo40xg(V$`yxc-k7ZNnZkB~StAh_7@phY*
zET@3mbdM}bTl-K(CKAh=GAStGxqai>V-;!;(8{zqX7O~~LN6R_`)NFC4LoFt-`?Dw
zjvWM(C|3cj(peN0_U6#T3tk6DnOY13ZZDK(02u*%_p%~E=0^ljg!2)#({iE7zWAN2
zfB*muy}!<D7{R<Ijv}3A$(mYLY(OQ9-<vrQR()~_yJBopUzt13tf(!RaSId{*7iaU
z$L@E!XdM$PnFcR|%=75v>XjIjSXxA!3Q$Sn86b*mRMJI;>dKH|aj+4d*6<^nmdWVu
z4~Uge4Vo6vpaG-(|6hal14_Rhocv4+ru}{xpvMJ=GE9;6iyoeb(PurzW}kawm&Mds
z(_YkYM@WEf;Ymup6LxN59+V%_7}yZO8%!i{+r0(Q=ZA2XL$&di8CPD_JY3TfsgKwS
zJ@mCmQSEt_LDnRE6eP+>X*ox!9<M!m&DMxRM@p{mcf6I;rbkpUP2lIA(((GJqBDc}
z8&?y+qR8pQUfC4=s8jvhA^cs^sVjUNBk&HCF{{v$Z|80seAzknmHOD%Ylt96c1{p&
zlDO-@e<dp15j<*dEX_L=?>V)<@nCBl;JZpMldY;PHF(!*H+tRw%P%_7kxj7Yc`yX|
ztLY$@EQ<$zMf(Z7DkCcC>azSs{}deDuK{rSpNKec-M1Ig2}D;pdYUlf7?Zt@W(=50
zKDS=9*lD&!3D-OP!-+xRYJV`|ffjdO%h1d0DA36FBm`(6{jx5B;@GGKIP2vqbpEAj
zC{qe|fZm`xmL+xoaT)zrz>o5gfmiyUbLg2wFt=s{Wnzc_Gi?Qi-!j0!;ybocELk7}
z;m_Ip<TxVkSiINGnOw~~xy+qrnQl6R`8I$*<z`0M#6Y){IN2EBJu~h8ka@muApWX%
z=s|Eu+^a^6l$PG3-G53wQ#rUr003#EkF+aI0kq{1Xq`$f0xK+8e?N=gEFD{Ez#5~~
z{c8?#UuU%KwG$2^1lrnrHYJ3?lxBR$WQk|l@pj$p0s+Toh=!~_DB$jI6Hx265ivxW
zzJrym+&nb)5bTm}b(;%ZDIc46V%MDk1b<)u4|;4xMwi)Z4oYb(JxM==;(PC+33WtU
z{4&b?&=)bun6$B*#1_qkGOlOe5JY<VS8<(*OC{7)!GeBug^FH@2Zah&#ixsHJrJ&8
zjpr<<$bjcLRk4C&5QCn@ix%eL)UOcJEH6G<w$%ISK}E3!<$mCdiLadAKp<?-8DJ~Y
zW(RcGyYA}me&88~?sS(7;s}J@(02yS>QfRzVVhFe4QxD#6#h5IQk~d%$j|RNaUwYn
zDS|8EvmH-W*km%U&`)QI!Vq<a<tla>im3j6D%L&;7W8%xP)=C`P9%IfSavgIAa}q&
zg>qbJ6hJ*yJ~$zZB<=<PeUb9|KL<dR_qze?C#+G%Aer8K#r1%of2MK*6}kUoK>A{L
zJ)N*A-xKmTJmZXe*JbfM8;j+zE`_KyHo{~92M_T{C7CGu9cAJSz^`dxYW$G)OD%s%
z5I-;s&jjy*wcer)sNmZowklh<?dMjOy;97q?COz$0o8%sI`<v|K;?fjykLL|;qJf?
zUpLmGLEL}{WU2Y8Xo{Gz-L&**iRhIagOdBr(ljT419JT@tl;9`v81JIRgf?{7=%~Y
zX06aM24#U(Da3@~^sz`7;%`2Q59&GL#*)*p2iW{UX<d&+X-PkYl4H=9zcOD3dsAv`
z>HhUxl^s7kwGj|{F#|}+20QaiJky=AM>hX{cP!T<nXTDgkK`OCKB4TgguD1lT{yeZ
z4^w*Cyb_f>M#I&OLzj9&-#vKscg4yae)E>m+6}|V{(=5hUxH-IX8KJs!_@O!>U(gw
zQK!B{umn}#uIt0Hvs5<6_jM;xtRvQaN{NP;8&OW^zMXl%Pm6v{c8+i7*65{`^2{jn
z*hghAQ&|*zC47*!Jm5LN7-B2rakq$B(<Wpk&nu_r*q1&PkG6cn0%Z|mdvmY^FY`aw
z0*>Yfv=D&1HT_l3j_6xNtq5SOWY>RH<*GYWoS{g9%vUjkw#!XqxcU|1<cnD`w+5d&
zu025F9P9fiZIW8`!!dOUfL?(h@`}q8zz-Hs-3~4=t5ACUZLw4CQ0QR-$e~<(H6AJ`
zP_zJm0B5&B<)5@=Q^Eqo7tQ_W{Ga-z;}bL_*E9b7*aDnV>q?I)4q)Yo{})t2a-9L7
z%A#B>?Lh=bCw|I4Nx5M+(BIjXG3pUbM4;OOG(M2pfX-xfefQ&ful`oLW<fS6Vc{MG
zH{@U$*-CXcHc%~jSqyM-#i)?jSQ(Fr^Rap{?`e~dmy?y=mwcX)>ne%2D9j?$26wK^
z<1mM_2HTZ_gN^j2I=*~Opa$<#HPXZA6@zh748EZo$_78_M%wyx7#T=%a%fvPud5O~
zIbvu{E2(BF%REiNjq*2+4^j0D{%Vs{&}L|-A#w)U1J0U;1?fekBEt~MXo$MR^K|5`
z-9oLE0KlXG#JPh6QJlBYHh}ez%-C16%i}eaCkA4M>)StpPI^BPGC%GyaG9ZT<AF>L
zA3PlkxN;T1>?W04Eg!!@YMs^Iur>M~KT=9@XRK%#T9;y&)&&kx`9`GE3RSG3S>h4X
z&rne1ebhVv8VPB2i+f)AXNc$c+5FJE$f_TA;?X(66oStg1D01g!?&`ivjs4I25t#q
zA23@Opm8pHYuO&Q12K!Ix>S5nMkoNg47<9_M*=`Y2@7=9ZoezAx;n`1Xn6_HWNW8I
zJ)4FyK^=eo-u%5<cFghm2jehCw@S4;3qdGK9@Hm_vou8EC9MzjFqfbyjyBa`a#T>&
zf03<J03s)%IsWT-^|9+*1-Y?P3Y;ZRyEr&Tcegp>IYGZEMl9VE2|wpKK|JRdHQ6K_
zxsmu6jPo%;ch{x$?N|P*a$2%X4J&=aayLzyC7KsO0xI-9yqtpw_PzTbueu$`TYVrv
zvCrOK^r6ms3UMk|p^ebANLHsp4J(|Ynuh=rw}aZm1E`XZ`74YBJf7CySzd)=bDsew
zuPwlpKEE#m9BcZH1{#*VIn==PDsT!5i1Gd|Cltou4wwhbqv0hH3>b8z{A2<Q_5!=Y
zIX^(d?IJ~-rv@N>^dB!2zL9~RS}QI02Ny$L58TW>#Hnkn&9$%H+<)rC6g6;uDaM61
z0;x@q8HwO_Xq9AIaYxx<C<Wo2NF}Dpq%hS+dK@$ky{%U%tP2yS_<JsGw|P`GTxM_7
zDltvl<cMG61!3o>LQ0t0?wCW+4}czYT4-x65;cT|19VekQgB`jKml;E7^^L+ysGfK
z9F#5ywMF5!HLCLawm=!Rv#wb8{+gR1sh@vV-?ZhRK7+g@4<ZYw;)q0ynV<K$^@I}v
z2!c_Eb6^2<?2SEs4HI;g=SHmKkHX)?DlOBc=)3-50USPYlt2>i4Q><-xS&*17cUYI
z8xQ+Q@)LlY{6li1xE>h|Z#Zlg1WKY=?8!}E#L1|Wm(5Et0^`mQWY;7(>{qYkQ%_8x
zK*kc$1p+LK{QyqV%eTf;$2tt;ALbhEm5dy(+%f(4AHAqGE~>xVu6_Oj=i^qsPpjea
z%<kF>IVtF;h~dY>n&+|}R|)17+AaEC&;IM!xIe9JREHAOhd!mT7CGAYrGXDM?&L;A
zQ*-(pHWm48iyT}jqU2F6h>8~5<JC_0-;!8fSOyA~N723<D2IkjKjge&`slH{*!|_Y
zSZpd__%c1$PcIK2m#51t>uT1;c%%H|6~MIjSe$nQOctr!QtQ0%?_dFopCyb9*0U;D
z%>v(Pv6Q30DFw%LzJW|GUdG3bsHocJqwVevu(3lvO+$o%(NGQY(kvfg+i>ph*ullu
zlL>^qq6fpaa=+s@aAI>ma$1wTfLn-L+G_^E|KS24maBu4U}~Ahv=#50tJ7;g0;B?B
zn<YeIB&U%dvSP&Zo=3oL(V@)|Kv4_>0`%O%Mz;&s!#R&WHaBP}Ul`umHGQm8wcoJ(
z@R!>}9X&8)gj6}-z8Z5En#To5E7f}=IKug!V}Xu!*Hm5=+cCnI%Ph<kz!a19<NhfG
z3>8I>Qe$zMvL4FQ<C&HKC8^gm0m-vo%6Jww`&KIKhvZs^pg&UlM-+pjK|tMsYshX;
zeR_*cT@lShT@t~WG{DbP^H4{p#$8e|tV*QM9Rqui3i4s?4m&MPvU%Hu^Cyv4u<0%x
z2qR+<c@^#FEKP4Y5YtR<Gj(;SkybQS$EkvN2T-HJt%}@I)CY3jm<^JOY{W_LXRO>7
zY9BlG(^yd3Z$8+R*J%}n(O0~Hem3oosWa||?N8Z{a+v3R6ZqScPNFQ>i82Eb0qCR$
zR4Zu`sa8;}9YDT>vYnmZjnd(_s%2HGQ$&$aeqLr<I89vN-yEf4q!wlU@ik~68f9Q>
zd_o=01fNIsbSnV?Rul2H4gZ<@;nTeWkzs#mNAx567(32NCIU{L9U$aAn2x@wcQj<T
zFS9K7%y+o!nPyYjo~&`_)&aT+LDHIlj|tyKbaeq@Etz$>mx>N}ceym%t>(&9Z>+{k
z02mDkI8=x{EOpxC&?6Rvu~saWeB8yuXyn1^uKk~oTc#Z+xCs8wPVkK$!ADWLHI);V
zZi^m7SStP(Eh}EEO0T879lzH?!~mfl%4KW;z@h-B!iBLVmv_B5913E=la@X;gTo21
zAdg3DbyaN(!~==p;xXliVI_YS5VsJT!}cFAsTYD$L9!HCBvBaRo<l14wA)u}HVNT)
z%PlPES)A0H&I21>>%=cPt+XCpM{gXBCOC#OLK~e2RtD!TmGs2u?C|<S$(84h?K^o(
z?jYVR)tmGFwy^z@dVO51{vG|Ux^<ilPQg$>P(Cg%9Mj#)*7!6P?9`?kW?mfruL~eP
z-rp5k_c>!jY@XoQY3=QXU<#qxW4X0$z&|pOt4`y_0b`4R+8*D#;n<}sJQGki3iyjs
zbS<>5D2t4c*jJ%Ce3Ym`nZ66Q8$<{|8FV5Si!SAcS8j=QhT2g8xdng?BqlYK08$O$
z{%43DSf--ear;+E#Zl6B?ZLkLOW~iAJ(*N`=>>HwmS-T8oP10+1!qYL4>R+;v>SCY
zKjl@ViQFjp>?}SzDCJo=y_E^MDbY%8GV(Jf$Z821$(nDEubUgxTloHfP>Tg@^tOht
zhYfwcoCY7%&%M;6qWS@~B?@n;2HBJYbYsPe^Fv@nsDSN?7$0h{zaFjc(z4La4%u2!
z1mKQfbC-&5r`*m#3XP$Vclay?52&6VppPHlJ6ccAQ`(e$BGwLCd-r|v&z1x~&~Lxe
ze=gog)A{hy#iUU2DZ&1cIoX*Nz!q}kY0^Lb+}Vfux0~spfawpKnS~_yYEXsg2m>!T
z+AKjp)YXspse(+(swrXrj!VKQ$<tP`Jhc|0eG9pFqIjWrBCl8ZHH*`!ZCq-!WQusu
zjYj}WC#Nqv$T^@g>XFlIJ_cHgw!8Dnfb;v2#edemHvK|jLnz;cBfl}sB0eA)!T0;9
z9QdSyoiBs#NJaJLEIN5Z^f_)H0Go)~PkC|_Las#SfZ2_@2y)KUZplq#rk?trCIGM&
zeZ7^Wu6**eXHe$F5r#7l^cCm@Eg~aOfg!QKA6-luGUeR25!d6=Ls;;nc834fGdKAw
zZhG61iX2<U!Th89{NxP}Chzqo2sWr{xI6vFUCjaMP}F@5R$V3oXgnL@D${%D>mn2%
zqCk7_IYCJT+fmx2MOK|Q$KV7^@A7C!a5PfgL%Ds`vC$mUyh7;1!lIXgFVW}iCs`jh
z*fN5H5SQFbPAbnoaYLH2Ph=&CjZ#RIB*}LKE!k~yCECMdhnoKBBaca(_790O?&&=+
z(^m_)M!NIf{Eqc&)r@xjGgZSe3pN18?FRtZ&*jzhA^R{_ErY9O_)XSomx4;GdW`TM
z8{-E#W>fV)tP)od9ylbCC^=KO3hgZL@R^hE-6s22On|v=!<{5BsoF8^*XszRaRRT;
ze4aRvpU;SG$lgDkDR1$+OS33j$vx;KF7bj6U)zR?dDi2VeXbuwlFzSxavFWwSDsl>
ziWMkGm0{o&Ba>3`+os$HDB`eR`jP2Rc=I$SvHpDWytV}2@Zlds&8tAeJk+$NU#-q#
zT<l_Tz4v1*bmZ;IVpQ8G1lxFXfK{Zcp9}DwMFK7<^D}x5b~lHJ2g70Un9!+LN(@7$
zb{U&S7kVSj@%&Q!rU%VBtGO0;brCl}=Mdt?l0`JfW#<B+DHVm>zS!6Tr%1!zq;@k$
zH2z|QzJRI{XkqhQ*_qqb`|)3DimP<xXaRaxOytvJN68PyDrhb9W1C$o0p{l1HJ|lW
z*!u#9c7D-fWO0rDU%NI=y7=v)<5$+&;nUM7XHQZh%)kP@hL}t~QN0QvzoVQ#M?#1W
z0H0SUHq(wfWDP*28LepEdkF_8er)}1>{-0JB$ZaxO~4v%9EN#XyASvr*RRJ+`zJCv
zWE>I(%8|g@U{aj(=%uQr(&C(HuLeY$M)0q-%@)m@?(^$|-64xR@7JfUB)2cKPu2Au
ziPY=ulr;Lyn|bw%iPVAy{wW?y_m7^6;aZOI@kG+wRs8N*z-K*;A_k?+5verX-MWRH
zyrfp{ddYjErN~&S6~9?npC1*5sr)Vk@$H7sELCZUV&h$q3`2y`(RP?qQAv;)^k9US
z!!Cd^^!K($8U+qb_hS{b2M8kSZE~F2%qMeX!y9sXTmz!Ca5;m}FMo*180rao!kNMk
ztm@$SCn1<>5O5Sb{-dD`81cq09DHSiV~f41;!MJ|mo0#JR<EGyd91{JqtjnTOTZ+n
z&93fQoAwf_cbfXik#)cH6nP&7IpWm=H;vbi(r>o6CEJ`0WT?zHatLr4)GikG&*AFT
zVVwa}pr6YhE`+891X`|LfeAeG&l`l~ix`+%?Do&-1sn};^7u4lPT1g$0O5U<`cFE)
z9w&yH`@0*TBBfb92Qi1Qn}FFYQYOujcyYl-FIohsbAvGxvEVZ_RGOW`3*B~3)nuKC
z^l84L5wNZ%^H!@@497;G&H0vw@^1UHN@Kf0VH_m}Zbv^Y8M2vR&##{)j%VGH35U<R
zBdg#3BBDLSp~d*vm6HJL>imR!Yj})VXKlOz+dd8O@1>wB>v(-z<?;6+=9T=@Wb6Cu
zmMesvy|k@ltrOr6^Ze+SXe}_{I%o?9Fw6x$1iZn4T~w!Og7E#ZPAA|jw=au~1BkR~
zb@{L$MrzhKX%H=5$b7|lZ{%?upo|Z8wEcpmV(Tc@g_k8TijH=K!NTbY!T5%MySgQm
z{rb^*lG<-2MmBCoYn<1Awlygw=OLW)Il87cm!icmDMA_<qBJd=2j}~Z%pM<OAoF;p
zQ-6}9cDk6r!sfy@Ha7hOizutw329K!aWoDZ(AvK?^L)*%jMIZx5uEJg>BUx2-2!w6
zFK%Tr<GOk~{PY^VQY_+`-yZ<R2gn{WWk(D#eaBQ;OQ(Rja7TB*R>#K})>S_$jPP9R
zp8NF^;QN1nYyVXx3m`5aVOOQ|tzHK!NKKE%sT9M9FW)&XE(`MZCq0fWY{%5A#!nK<
zePWjZrZmg<aw<S~cSz7S3*|SoDIW=|6a}l?HQ)XE^`-J#)hBrjzaASd&5P;8cT+9c
z1bW1aBuZj2PCqb=LnKCnB8~tblft?8@tA8tBhJxvw20v<%(q)Y%O^*;naXO;3@nX=
zpFVZH2jUJ4-~MB-pUgjO+f?$;Up@_8i-=v@9TO(_pS|$(;$XMoQ4X%rtK8Q~HA?WO
zGPJt^u={_q<U1|N6$Ak#Ikv&pTvz1%(=7#@Sspg%YEzdm+4^rbhBzvoGCP&Z-{K7F
zgkoI>7Qe8JM}X20_xS9S0jetDP<{qn)Djo>?ogJjc5=X6u1oA93W9@|#XLHf@A1_B
z!(X|_0+Z>k(dR1nWvgN3T|WP)kccY3VtL7_*_D$E6je?-Mi-U+HX-USA#8t=#-{Jb
z=G(i=;W(VOt0b1#MS@cAiRj3S?z127^`8@9v|0g$d{rfmQvFBxhBvQ7&$DW+;%pmq
zDSR}7#By(4wRp4HU&Uk#W9%%!kGs6b-*T?0;4y*oc@(0_W5{SXep#GJgzoh4rja_+
zS&R#THiL-;B}g89QFu*D|D5{dvk7EX;p=hGEBTJb2$rJMT(noWLj-!;ENQ_K-HZ_h
z9Hv1yf~=9pznta#mIf+5PX_@OPeu@e%R(d8y`h1;80_@({rO9?XR=@?E_b{g2fzyz
zfLfkpyfO|-g^kP~uXQU0p$u8fYRpF>gX;iSQ%|WVrAsomcmc-s*ct|X&`4qdww-N=
z0Pt`ZhrBDFt?L~h;$6(u%)XI+tp~pPW|#Rj4KxVQ4R5)<0WhZx;{qb}%@mt6_%Ajf
zW&C?a$quE4ZwWc(7$u7>%m@TT{v-?QIiB!9w3*(%Qdv9gx#Xx$S3W~8tL$LG@wsFr
z0<mGmV^(9Xug>D<ID&5>6nt1umYUf)A2Sn?vsHeF1+ijL-HwJs2xFdn+P&#q{vyJ6
zWg`L6YC?cKR*nK-x@2ScPwzv%)rlg6g8}=-z|>%F#jYIP##cxJ!JiR!2Q>uVoK5mI
z9i4UD@&s#(0t8bC;Q+glSV4iu1Tzc9-8P}(p<;c)kUH<<pTOQy%u~*uMqOv%J)r>t
zs4faP8-`#f{4JU2D6mRQjI-g(5C7;>nzU-zpPm*6cRC5staWjz&wZ8<as8wUUSq^~
z6#Pu!?`xLY#>XuY2FDn9LHLOuSuBhhC2&Y+Q)AIPVVR_n#2~`pIT!)eg`dc=G{!#y
zI|wVlK8+=(qAgga8{IT9eDJq@F#4<tC$2wvGxzW#q61|Kh;}3Fg@?FlF4b9#*YpMD
zAKso6*1_O#yuNg^+(^qh@T&0#)qy4{ujl71J>+=lfRXBPlTuoEILXspGEz`u_Q{|d
zcgmYwTD-u)cH;B>gB>V9?rVf__RydrkG|!%Mz9d3;{GYyq&ZkWvbXWDL)*cIuJx9#
z@^$IaxxAhUf65fJ6(y%@>OV6af``Xc909aEJN1Wg@lzT;XDq6wBD^OW(a2{x&pJJb
zTu3oec-kG|jVHRED4giaBXIIQ0MBgB0|Wq^it3+4ULR7<XO$&zHR6wFsh_*{Q6_v4
zram^I8C(<Z3SP7N6CFTO@zPcSFt(fp-Bj(yev)|l`^0vpml~U~0qBiQmbxfK(wM}7
z>V_9+qdU(jXI&ac%SYCIITFG-%jSjjcNfOKM<X3}w$Xd_Le4qUuNUJvpPsxX0e3K1
z0&#}~q+`lh(dfpKi<L_=#-fdTLsd3J<h_9YxSsBf>)Ke*xfH4C!TyuZ*zVA>bWjvS
zmOeFh>gaO@Z2~O(uK;~wbZ3(Pf2mO6@5(WB6~Xiy3GfLM^{j8#iuP@tlT_ho>wel6
zSR$NI4Uq%l4{>2b9tE$AJk<m}mOew9sv6(DquORC;m1ykYx##@Q-8#UMs$%}8*H2q
zhF5(`W;`F`s5zQ;98XPK9o=pMt(RQ=^9qZMB$L<Fm75~43)4+_?Ba#9_Nwfk)*qJv
z$7kw|{@8R7`z}O;_><5l>u1kjhrV9iG%92G66X>2`s-Tz7#h=#6)7nwt?CbEH+yjz
zXk0es)lsUU_W+50xSO(0j9`7|W2!lz{1h`PQ?rxn%h86|^chq~`2A&Z;Hjx2XzqW^
zO~=ejUeqRQ2QaAy#Y!)9{pIhYBRm$$REpq54zUdlS=V_ZcO`z^{>ev%wqM`A5QB0m
zVJx=n0JjqyC;Xl<L7{AZ>r^W{t^nWuFB26z)X@^JngL?tJoIi<&U%m%fhM6*eQL=*
zi}uvNx_Xq72_<Ar>U=DUc=c3N>y79&(Yr3JCyC*BN!iTtA=fWx_qXL|ej;hWcj3RJ
zU`JKVYpNf?IksJ&NODDD)(*a^5<fc$B6J8Aa`Do8nf$ir!dwcB3q&kIu^gK{Nnrdg
z##RE{V;884Q=0^H)_*tmvp)T)s+72saBoPeiCUyjB>>|F5&m+wNt#8QT^C2_LxncG
zdb}cV+mH!In4JS8$}2BYJ{vm)285|ZVnm6}Nr-ii<+eIm(+D261MUlGfl<il&mrMz
zG07YmI5dWApWFysR+F^k`F5}LG$YWvVoBg+$4kq3e}8G1hG1_<brR*;-v9F?Q)$(e
z<PUDqaMiIwDF6W#^RX0xYPCo}`g9Q22?gX@7<aSlc@_w;Y-u|-9^yIjJsbvvhnaex
z0Mo1KZR9FQ`rCtPUJSJb`eS?^#Z%o9;FgD%9u3PjS9ijqZqq5qwbD!rDzO_b6BL2O
z`+WPh23RXk@+AN^GURU3wF8)K?)%jcrz6ziwGvPR=a7|U6UqqgY@^LW;}4`Xuxu1r
z1O=vr(za&E4qX`7s*nXm95iz!vfwoe&r@}QHK;iDTK5fI1ers|&&3xCJ(!WB-(gaj
zad{_fT`QGh_^uX=wftAgJ44Qh6(;IGfL88{8~b7el5q4+(P%xS$NT)(a~{H{rO?zb
z7ixk=><93bVL`_o2JYafBp5L|l)D&3y?5*UDFw(AteJkR0hb+g=!@zQPrK~pf5!}=
zQ1)_s*#i<}#f#V)q=>*<Diq0me*@auAR{GDt9`TPer2Ak$S6r8q+I2Amnh+PF9I08
zbZ&gXmgD@VWWMfxfUN@FtD<{!h!e#L`TO#1@j9*Z3gl>InBmvmQgVL;L+WgZ{%euT
zoEbvlTxBZRX}_FzAK=A-gy7@_3R3gqpH{K{%l1GWdTiySOsHx4y>Gjm4f-MdO)q!r
zZ`na*IXA$*1h|~U>f-ZnbMPB?uNLpo)a(TWfswtfnzGNI3*@feLA!4ox&DNJus_rd
z(Fd1Hc?%4eUtXjw+jvopADm~t-R8Ktpir7$oTq`;+>se9&n#Yy%>>}6X$?W;Rz>at
z?jH7+Fw-=rulr9tzT~t>?;a*;sea(R$2C-aqETiOh0DN<{rt-;8WkoHl?XRE`mS7P
z9BI<wp~TVm`F=6pqOp12?fp}OvF@+K^VKsJ2lJ!*D~2`$5TO~*^J5#ptxTd=buF8D
z2nXb=XYK|wnU<0Nxr65;jG3xjSHM$t^->rrD?e%TDvzL>qo{4i!U!yOTgrT#MfuOJ
ziU?)v&Nb6@*JpPXJL<-nakx|f>$FwRS%~6Y#Y8%VD`j0YVngOAIxse$SK89=3>3oO
zR@Z(@F{)cm$+o+XH*@~eW_FJcJU`$53ys14#*aTufOStGm@{JGw3aGxj`01@I9Yjo
zozwBhE#eU|?|(Z=MOBpi!0Q6_(n7(voTeAnzjFe^QaJ4J2tvIn=DIQQpS^iXKjztL
zTz{_1w=s^CsllLv?liwfpHMFA?mZ71dAIp1Jx-R~bt;$fBiX!rmB`MAiRSW-T$`Dn
zAt_jPcnZ2VI}4)1Tp11NWo}IIqw1xnWA)F79AahN{K#zljqy9qh-6<$*zW0>&E#H4
zPb>#P_rL_8H~wTWZVm|#I6?Ug&+nP<oXea=AJQ2lZjYtshV9_>{CT>b&nX0q_4gZn
zMpU53TsMO{#nTne{zk>log_}<k6)&PzQpxDXm<K>DR<Z?XQwdwyd7LKKy38H`>9pg
z`VDB8LEk5IXnXysEq5|Jpk_tB4?I{I+We+GaI)Ud3|N08BqfoKXUEVyr1wO`^<H2m
z7|<(0jG!8Hfwtt1@T1cCr<C2#UVtOkMsuvOnZCU13g*0i9QQQ2KD6Pn$i%81Jol9H
ziB8maM4d4K$TfUl&!77_zIE9b=`iPx=UcsI1mtHq?H%L1-jO;uAlC{*nsj3Of#><b
zdxUJ@aqkiF)0tU*hTKU{V8_T96-7^;>+HwUxCTmQGJ>Mo$4M*=O%Ru{nN8eo-=1?(
z%qCZdaW*eU`_o4C+DX(tCV@Vo68La`ub2AmNry&>h5|C>h@ImC@l4t&@7J{zquD_K
zzlk!JuyetL7lG2WGP<WDML&%)>`%ujHt5ggpTa#S=GCdTaNvWRy@L^JuP|tkJ&OGT
z_ctoA@x!x(<!;J6^6M3DSE~40T42A_Jkil1K9c+5%R5nQrVp^(Noy-uu1lIVh?$|8
z(C^1zg#``G=1k?3s)_oqV&HKhEjPtiMzZoMJ3-F6hwt?qn1465`O#W85^~ThdBB7I
z?$Jj3^&t@&4q8=2CDO0xqq0;gV~-{ZStgxu^?Y3uMl(t$J$6ZrbzQ+pG3FDwlTlZA
z*85?t7sIzb!j$io`fOmd(AI#h1@&$nPWzi2epMLM>(rb=Si?e4*(f0~aRi=^lmKpC
z8ad;?TfDv|S2`W&YrER8q^IGLb;|fPV2ZVQJj(CvdEq@^@S~QLbM~F{)(k9-cnnQh
zY_9f085TtORxLRp!Q{{=)xbU7^{{N6H_r$gl<P1ET+G7q-$!>Y2kW|EKs#YytzXez
zuOWvP&S=3B_$^%SIhAk4SAENc(9t3|;(*WRUhUWX%I}N)H__wSyXL*1I_iAx(EC!m
zbyX=cgCSY@D$DVH%9({%IfmXjp?KBA&?ln2BPq&W1qHL|#dXV1>eeXg38v}i;(Gll
z4PK~jlg|8>eXv|Xi1urZk8_R{-gS!`q&=1tK1kzcd;W80%BlHz4q5rS#8GRJQDR6O
za~$|goPZQsrdu~?<Cwr(q)bY#yJifwG_?KpjuTt({PUzW6}=MY*&DZaFOZm!W^y9v
z<)M#M%G^Wyk5Ft+k{4hF2tvw;%~^dsE+pt5v=hcSRr;uk(j0=Jx>k4d+Rg*XvqWWl
zvyTVo0Svi}dS%3lu)lZdy~{ib@J}lsBkG<ew=*TH%nl<N*NtFq%IE*?g&`TDCi?P`
zkE3NuH7~-EH*hA6Fh{AB=<V6!c~{-WhXTJJQMbFf3A%i7>}(8S6ZbUx@7deP%1m$N
zWn95gn$HJ_*JO#e4g;l9UJ)SN-b$rdT03)MK;?cFV`XR^;DDfobV7J&C*1#CuVhIW
z@zDzZl*A`YY!GqXw$)oS>=$u$XgJTHi#!b1FZ7g(c%#xDp8wLFbsD!S7bvtVGLGV>
zl<go@gh5&Cn7$pRaxz>y*yOl6OzZfYOS*E@%wQfiWKnGXI2s6}<nB>~z0sB#Rj{!<
ztVJTOS?&iB7PtJ6VonAwnGITaUykr49>}%10>l7iIJmZ4FH&nxQPG5N!)5X{7CNX3
zzWI&^?19w+Q|j&wWL)WNGGE~9@ikWC(ksm@KPj&vf`Vg3EM#98VnDpBtm?Akm<=m1
z_9EUPaRuFY7{Dh>mBF0zJ|iyf7+pZ%!qJYoBYgMIBMd1b$B(dF8^93G#;Z?uf-BC8
zVPXQQswdCndX{${6fPQv^m<A(&Yn}|vXd$e#HxI0L~ZZ1n-Tt#vO2+5O-Z`n5}C-(
z5eqW)0Fi?ZX?$`MX7G5OvMWZ^V?5o8k9rq9<Q>Wl1(Pixr6`1}=~r8-u)S3taFeFY
zvZ6?R4DAH_AKzy_K-37L`9Niy5052_51Vl3WMF8vdo9tw$H0(X<Y%ZS!>3b2E|sEc
zZ#E83iDULop~iXi78kWOp$uTBbxr9E!tnMl>K6=BWGjBMFycEz8Y4s8Gqhk%P|mQ>
zzn@f}1n@xQlB!rL$s@M*+BGuJ*Oz~9IZi|@e5b9>1NU<>uazC+B^bFV@N-{Jw`%FL
znBz{0ll92VB&)St%|*EFoK&1TAMRSM#Xcs{EnFbeB;eP_CWTt2z3$Qq8=46E`MHCi
zj}2PbSQ1AjbG2|$^du)`=HP_arq1$<8Bg=!0KbJ)I=hw8R6LKcrZxp!?msJzE0n3I
zti(-pb(9{mKXy7E4!1=N)Jjb9bj;OH-cQ7{&v5=AhcrEAotJCY$=HKEquI^vjMoGb
zaxrWlIK6YFubSaRP?_1pJo;6EM@6)@y2`<v_AMrs1Zbn!|6P2eV9rtfA?jz3ih#`x
zA=|pde(~iWw3n;j<7BDS{|=)+WLn*oJe1>2(@F3$m;{BBe!RJUN>t3w&@y>nZX8z8
zGNC0?5&?TdbVYBs5gtmfdxp4H?NAN1VVMFR(d?=m!ZbN!g7z4}_et1#9l=YmX(V2Q
z-TbUd)Z0A2=RF(=1TNqp8d%Ig#&=<5xp^K_jB`L3$nD?fW}usUTH%-fC?&>uyh?Vq
z#7n%Ku56?)vwGZd*ig#tm>g>PGcPbS6r{^%xtm=7H0H6B{Ad|#&GqoLU?l`6!e1HA
z;;?WnPSO2bEL2(6BKfaXmhe(~jP>}q#)7|%0r^rzM)(ILnlg-dU-S9W0T8GW1LK5_
z8DG>11h`nG$R4zZ2iC(j&e-oH8s3lUdoi;YDR(JGME=^HbpA8<$mJ)KLIT9gwN6N}
z?0*i^dN618_0qdG!egFyu&!r7&;w${Gqeg^&@}|)dn?3a-I0l-ypOQmVI#+XyM;kt
zmiNq+FDPaLrqcx+qK)v!td#jW^f8szw=3yxAn^}+QN0T+Q;zgAQe^m?e8<0w>&oih
z`_>s1t*fo~x7rexNdE{q;&-(Ha_VQlAGxz#7FLKJPq@}sC|nlo@&i57nV>!Af4Bf3
zo9zldJp7wCWKt<F{-K!*j_ZA-@igK~)W5NT2ngRDI%FL_-}d#d+U9>A)>SIavs_<4
zW~M}wN7^P)jLp0b8%%Dfg52cuTiNi%jK0q0`cIJbPysgD&C4xMZ1A^#fWRZ@LWV*3
z?!`E!{xhhhWlj`4fQg}5E=qG*u*S2kqQ#i!`soI*m2tn3VMS1=1#QAgyg-nYynrXi
zS}^B~lWXm8x0Sk**%K;PE@GMyr-IbPP#{j~!c6$BLc@2iS)*h&bVA*&8=cS73C(|{
z^D9x_Q~`-j6_#5xey-V9i0Otz^p^yMa!}6PZ4bMfp)U&Y)s7=~lPrAR*p@Q7bAM>j
zDz_SJrHP4S_OTy}AcO8utzDZprsU^SC-62@Sj$GF{3>1#+ng$%s*VtH-i)Ufazm3*
z6vabDmN)d^lK?kDsNlttH0%$goJ8Jz&rk;Tr=<ZnLX8PRENG*6z;7LfDgF~*1Ps#J
zr1=Hs-OcZ!YbFbcPG{$@0n4BTsm<%-){^ikSZ=AbeEklonQ*h*SoslwHx+-P?DW9j
z$(=lT8nN8Rx>JTl7Bz()Lw|7#K28_f{MyO>W>JX4ktvXweov!34_t*XQ3x$ftJhNf
z5yt{K1<#73s+)1C6u*qEp;m2m*7hOo*2r3aoukksOB~LJGwylO2QimM%1rsedAq}i
znH(Wy0i1vH`BVC+hZttC(_t4Y*lBbof>Go@NkT7}Q(1q2n&i<o<|6W@<kST6=n-m#
zsUK;71LI(Q@dLhp^QhtOw<1gmMm+B!bKQmm$}Og#>UZAjWT97bylXLAV-l)W*H$E9
zH#mC9uA|&ftDZ{_l#%$oWg?;fMrf&&vX)385;PDasx+#R`&NSrM%ckGfmfA0hckY8
zC;b2YixCGWRO0UIs|M1V0w9vT%YfAZg?Ht$2Wl~ZH(|7bGes8o3k}urI7ZE*{ri4J
zFt9;T=}Z@fRR+|b`ff=-H0~ELWaP5Urwy*~l`SxV`xv;onFpu?uj?SlH9!aACNWf|
zrAW~5Cy+IABF&DF%1cq$AmH~GgIoL9GDg4bIx6Li!WkJ*Z&pV7OB^$0BsF><2ALz!
zr~rG5>9`<+`6DZMfOv~jECPrvIG`Ye9Db+@t%#+-sKL*&jX`y`rI;{}ie!Lbpcn&b
zVs@xj%o>ZKnPc&JfN)_6@WhMCjwVQLQYm%5absq;fR2%53%1I~&@5){>h}{IB(+hQ
z543lXDLS|3CREDsvm3VMZNQS>Hp*Sj?@dMIIfG*>a8oILx4BgZYtC~rZNMwpHJis3
zZfZYVX(k^VC@Cc*1c@Kz&;-DiCAQN{H`q0_A~o#oVv>`Rq=C2;bFjjI)liurEuu9L
zzeL3(?<}sq-DSc5R=tUN6SiApALhPO>rnJfZ#lO)G8Y!e)iE@hypq07T0W(Vt7&Q7
z?!VVVq%OfVa22WG9GKvIJ%ux0bN_a}#=d!jU0*qJw(++MB$`#s?>eb?CFh`YV>L$8
z`SzE<KzdQte5GZ9Ktg431#ViC<&84+QuYF!7jC)@x2lla-!!4rlzi$YwHsLApMM(@
z4~uG+D0JXv0ht$g7{E@oCW~{z1vctGfv<p-?|NrSjzM@S{|o4bh@p;d^6J21eg9tV
z#EHs|<)&*s4|mj-Yd1!rNxTHTcuB<^Tt^v<SPg}+35B<5VF(JPidNbDQGU|`jU|rO
zEZ21ho&mZQQG4-H_&Gki=C-p-nL)tq+eu^-UP?;j!C@-!g~xsd^Zaoaz?0I%)4*>e
zUOG*qb{e^DhDeAtn@%N&3KgMH3Y@4b@V~&x4Y=W0xSw7*Po?4LcqZg0yi_=FrS!hd
z-a+ABjo+-V#%<W5=`UG|OZF`sL9aq9_9EaOnQQx9XvQY9EVEX*uBDYz_GSH<HzfNp
zDLFn9Zuazt?@O;W{U26Y=G$!*o9%}9+YI;gc}Fq8r~L+>#Y^Yz9~I9bSD>pSLlftv
zk16XasJS-v7XJu^hSDGkzr~eDNxYH#R;dO>7N{WxUndm34{^<~g`7ZdPtHxZY$#J|
z{RNMczUz^-^RUxh2;A&AQ6X<(53)2{MgJZWUk_}0r8IJm7W%l6e-C1YkdPRV62E*Q
zcqB!}jgLVX{vNd$EI@1z5YbuL8PzUxZbo$8vv_26Z8BRNqHVeOeZBm(Rj}1L8<BUD
zui7NHRlc_vGI-gRRn{<kur2JjmruRe`~hj3#(Z7hBo2G&>+Zs?TB0R0TUb|fH{#*9
zuD>!jlk3oZ1m_!Toi0C^xF&x&tLnVP{`}1A{L$Ls^aF_><4sR|+m^9$d}R4B)sC(d
zK1i8*o;9q#lGG2r*t(gS1dh@CX^YOTe?mQw4gt7SF?7pZ{RD@vvi1B52&6qKt|gz^
z>v3Nx&BP_t@toEk?JRd}Y@A4&-`p~nWVjSh$7~u6opk%;t6s^ewaT%^YWI<ZscSE7
z&jprmsphn<l{+jU_={xaS$Eh%_Ftmf$ps*{bXWkDwYAY*?hewM?o_)3vZ7L=|C?u$
z1sPW$w;c~P8^?k^RE8AKI;cGzckH6vH962P{PeciXGE}r5qvw%Yzj;0(BQgSFO`>(
zH@ST2ZS-^HPK^B%!lW`|$IpA9Wj1n|(f8S%f0^rYH`IUOeN9AVob1tf)N<3{;=@<=
zJq<(&`2;52j6V{9h;quZ1sEpjOWnQIUV8X)O2eEx_ThqK(Ix{r-;wv@(O<@M51tl2
zl$-9nPiATGvpDqA8u!^di|5}s8(7~`qI`UX`>o86tlh0?-|#f?%Hk_6Nr6UNX;2QX
z7gf=rY%phgOHds1#!wOsVK@#Do!co%ero(DY$yIhk4KIPL|3NOdOBSaF7SMVRr|2d
zxu^@ir;pO?ecZSxXNgZi+5I8Tl?{?A3+oJt_kIO?_HFk$zLP_*_t2tnYI_~z?*L7%
z@w>iUf+wX$Z>Bz&ktl%PtO=7)TUtHzWN)qALkHdtQ~>({-*aEKsJ?h{?RBy5{knKJ
zX3gF1WUhmt!=+pFj_aFUf&``lgaMaZ1&zhB_zI9qPpifA#ac$h(leZ|Yi&V%3NsJ4
zklBWVavDC{VyAg*_Sv(?mDE)J{>n4AuVB72JAEol7FkkIPhaw}DaC8IgH)CVe^Xk9
zy{{7n0U!9`*>5c~Kp7V24R;f5n^_%TH=*t2`2<GotSJ{c9qr5#t5GF8wb<k&COcXX
zi&vD^8NBV9<vMyBk>SF+&BJX+XY>(9y!BQ_Q8Drfjj#rab+!wbDWu7tbijYHH-YC@
ziX{t4nCIeTrGq9}Rnl%W)@t-%^W{fx^7QzgOk=HPn0FlFF!P_;!7XG9<L3MyChg87
zTOLFCE1xqL>>1j(XS?E-e&jWL%_vf)w!0XXztAh)W!(<IIK{<fH%jwzvkJS94VVH3
zzMETfw*~C_&EXp1rj+w+FN$zX3h^_g36<nn{wkI&nxD9_9)EejB;@dEzQrl#R_IP5
zOZAsa$I0sivtEk}*ve^N5iLi6Q?ct|w>gvQVVBv+#ZP6odef`7(6!W0eY#Ph?bmHt
zPSye(;PFrID1)sm%O&N3zi9Tb)yr~LcSwYc=k$t#Fv}zD-}KKsaO+eAroZ_BpSjLq
zE@q4XFhEcqRg`jdq3lh&bC|eVzM9Q|dQ~*VLH^po%IR|a$mz=YZO0GSJ8qULXlen8
z4-A>aGo1Rewsk2w<wTROb!PnC0*`Px2;Oc(-C{%p4L;{08wVOKqLXS~w0V|)D;fL*
z5=K;Ua0ztSYiN@H1cn2-Ng9B)t{V~$2q+K+7cu+#C*`&v(fK3(JVyNQ%hw;9&k4(D
z_UV-hPHE$l)elca7bghUrcSzZZHRSb=}ZKgcPGC%JiRH61D^Tr*Azcjn_mr8QJRPs
z-yE|3xQRWY`IN(`QO^n4n3bGmde5yY<z8{SY`e_`e%*h)p?n(ZTUB7Y9*|xf^@a@S
zvL#a=vjByJVnce&gw?D^IR%j^0f-WLtD4d}%Qg9WI!B?ha(FgRQyU+|XFINTkSm!x
z8`zQSFjIMVq;5`PbDjUW95OeGLQo7S2L5!+2H}Is^3r~-=XRzS9xpUFRixW19tr7|
z5s@Hk;70-1ZZu7~D+Jf75~f!+qf}7mLG-O5;fa#XyHnqa)3wSFWL&RpY7)~oZ+|9c
zYl8%<hL5~P$(lyzC%#MKk=@sSBFB!BJGR^H?$bMD_^lQZLmxT|Lq{mFZK%rr*fB*Z
zYa`(>UGoJlJJU&cM`D)+aHXD5bN%3AD%?x%2mj7Vds2yqS+iYvT<^6>h>LwPWoGtC
za5H6`WoF%+G?|T0rd*kRB5mJ$^eV=i`@hkBQ*H)8XH-6~A#W+rabqzVupAkx$<gut
zKm>mQF4o?MA)qy1{fBDo?o;ge;mNeP{7bvbD=*M#_q!SdsN6f_-DokYLW$>W_lb_{
zEiO%8XIL#mL8oq7QF;Gvj$jU;%&nllq5txQ2vs=(@lcu!QEqJzr?cJ7qkFDDpw}3*
zza*$#&vT>iUUoTiYT%@WXvyE&95cV>u9gB`>$voej#jJ1vx=0{V77Ij)uQ_e=w!Y_
z3=a#DF;Y&e%XwYzA_$=RI(EO`mbzI}XEJbFIE&A>9cs_FewJ!JF6cZz%7_=C7+Rpi
z8oJKDx5{WYHoS)zZs`c`nQZSBLy`UTSz!$`_sc*xs;ws8JYt(Wxu;?27MA2N!|SG4
zw&9!WAdAS@LKKM~crohHv03XRIR(dxfvY=q%k#AO$`)#j3zO^?4fbwVIJSI&MP4wJ
zzalN2SpE>0PoRJ>ylGCdn^>}?D{o$)dxH3mfBkB~D4o9~zUYAK@Lm(BC5cDP2<!Y%
zP?ly~TyKDO2r+aA{o3U)8{xv#x#w{0E%##wCPeedoQFa>WiJ&!HPY_r9MsuV3E{YT
z(EThf(Q?o+0;^F~>WeF|O{7)3SeW(AKHYSOJ{-wzxzOa~B9e7v<tuN6%W_$FeiKm&
zt99#*JNopwoQpDAw1#kcuT0)#r%d4AtL!}>3_3>iF8d0$mcQ#Z7^_v^-R;&lgmD&D
z9%h;y`YbE<0lBort)*@E^fqd_eIgn;o@-OCVVUloeXhm#?%wLNLacgJOF+ETv+v#l
z(_;Ztpveyw#j?^?ncm}i>0*WH)QjyCzDXy7Ifi(F_5b!YfAq5An15>yY-9HnqIS=>
zk7m{(D3Ad_8~A|`Lxj_K5T*ryUG!0gY)yDc30od_NnS~VyOZXUmNw%i_;RkgtTy=M
z2!<M9!>n9yIgfD2TYUenRMa@$FzhyN&9(V`X$J>Os_pXuebk{|*<O=5{hw&J$xn2r
zeeQ$?Z~JU5sCtbvPmd4Ja|y=F#s%8tOm?KldXHQc8VtWS#P!~*&a9y8Xa2QB;&F*~
zPJ+shlD?-(q{ej7LU*u)tqVX$50T1@`LcA4dsI7GI%Wi*GZ%aw*RvSqNBOcv>jRk`
z!P_|&wR<I_9p8T1qw#Zb9bR!QQDP(I4&Z|FTi?W1d<S%4qEq*=NF=IG+EW1Pq<$$J
zP^;Kn1D%2VW`VC~=~Y5SO%B3tE(!>a=Y<QRw=H9Y?=Ik3e+xJ>(<l|KEq|;z{sgKc
z;vLDh#}mSv$z(Lo3NM&SZEx;euKd*Y&Vq!`jfSwuE;Ap0DiWPsRfuYclPz)+ZM7*+
zH|Gf0U3fG*0<BcgtWystUOYK*r3UIFX$RgvF!_P!P`>snmm2e=%3R6n<)OL?%5Y1c
z+vN1IY4WxX8!PV{0<~J_h1yUnkzKy@fcxv9tJKOb3g`lDD%RtKt($i)(g*i(|6S}B
zDr}Hcv&{VSz%(Ev9F@LCr~3MAz72kH?ebb@>5JxI#@ot`q3t`&q%R^^)t&K-@*BX7
z_L8<&V=N(f<S82)8*#{pnLZJZ!=(ItCU!&7^NZ<sW`I1``R|6pKq6}_Dd$eW(llVt
z4ja#t7lUVs`{}wLC}uv(mBBg6%R$e4m?~B`F-Vk+B#qZ^a>iWfJ#9KHa>K4Km!wO>
zrvb8=Q&Q?z##R6SwB>&!z{RHlm5NQ~Vr_<V!VmYA(qqUnqTja_*&|ZNNE2YGjXOpg
zju0beH?BP9xT*e#5$cN^RQ_SKbR`YvMms8>D+(EBuX8OY_5s})1YDiYzTQ&W%6Vw3
z;r%ELw(GFk(gB@MOQ;*0Jpt_#;JXEXgv)DtXJ!oxFE>AkaUlxDGq1lImMi6tf1@iZ
z)dLQ>dunRF?`ejg^Sm^fflEQJ*rZ=Y0Lei!x8H-e%8MdKbiU7jwOn!OG6&_Aeqe3o
z7?Q8qA5DGzSSlXOdGEuL1RS^Ab)-*pXtc1}pdpZu(?e$?mD6`Bso@=(g|Y85mldOw
zISN+bIZ4sLaD&kdGkZDa|FHL#QB`)`->4!gAzex+h)9FdptN);pp=9Nn-(QST1r5W
zkQPxIDQVeAC@9?s(umTfbo0(@<MZ6!zGs|q{vXblbKWlw+;FYEuC->*-<<QV=cZx7
zu)F$RSC6YNWg?b!cg};!YQ+0;c*N;BF;+^!v*S03A~VDvJ=sl3aM$6+w&MW}*N#A1
zsAl%>eC3Q2$P$r{s<jS=K&f5Z-_lTY1@aKCNHE<%8~%8fzx^;>tH&{$?`auTMaaIm
zQigs{4DP(l(#E6Fz6gp9_CuRGpG_)SzWV#ubKdh98tP9uea_5LuOXRe0-r+hU`3YC
z6_Bk}VMJm$9s`!5&M;O7h8g7~=}6Q@ycBg+TpTK&NOaykP`-CYC5}htq8hYLU73o3
zaeeZQ5eLdAt(=G6yA)mhRYztw|Ml|BXmxc@p1vO6t8$oC--`NP|BQ%TJ6_ba^!-n%
zStSwCN2_7Y+|@(28#BU4-Zl>A<l?#&n|C17n`8I*$CB$xA%JMTK~y45KMAoWu#$q8
z!1<7UvGD_y^!l+u4!Wk0pY)oX40-+oNwn2h5G$G@B59%3#)j1?24!Wb$|)^#{@B|W
zA@@1*5qD=F-BGBDe>aLy+9@r;QeG~ScBpj7EoUxydYh#xC~xi3_kQ%tPL4Culyr>`
zO!^+JpdN<)*o`|oYkGAt+HXmzslKu`CitePTKv$qBU+`)k3&JB>rRJLV>o+$`!+&U
zTCcYfvoz)&vUlcX?|m!9jXkRuHACPkI%Qj%-T14vLS?gOz4K-q<wc^N4OO>Agtqb!
z+TuiLizQ6oD!nneEbkI?kHwmUW>n=@>1YW?&K8dhwPrOR#YeJeKi<kjkYk-rQv^e^
zRM#>t-ahE&MI}7dc+Tlw9bj!3jW%+fAbh6y1^?uxXtY10IFSN@-(!d-7i{k(#55pL
zoo|U&Eu)jw&bcc&C8mj&t~ZR^tUjqM1gTZKOz7jZNsz`Xu-fo4#P0qa5AT|hrKnay
zwWH$ar=NgM23uT9e3<!~Ha~jj;qtm)bL@1qDxCG0I7|eW;W7l19cWF&;s|jgwr(O-
z@F*{Fp>MA<RKW!A!u*&Sr><9bzJCru)yaQ(bCZ|ur$5cTFx?VMKc{3CYsfAqGfKXE
zG|Q?}VC;vh`qFJ#p_|LKbA7h2q_t1qIq|mRmz$H{&3!rAudY1h&_`=R!=^j3sZXi-
z*}AuK(es04YG*xnAJ~s5MzJaQoXE10;L<IY7qA()AQMa(1rkw&8ZGlWfpnkPmjl-m
z_lTE-2?QcuM}>WIZXj)OP+4b9b5P6!r(MO_yI(r`77^E~aB|&%#l-WSiG1O#mBkp_
zP9<=`lft;QT^mTvzpX~r6E?N`G(T*6O(c|S@oDv2e|{~Si&*y!A+%Fa_3}>Y%ff|?
z_Lhqy>G=5Rm$!f38#TmlN$%03ROP84&mFoIKeF$R9_Gz!?3(PP@F21U9KbMzz9&-`
zskOf2>xI#mSra-3uYYkG%Kw_p>VB*vbB=IM+L+V(V}bk}UDG~B=ShIh69k=y72<@$
zWRm6h<#g8l&?QIF9Nl)eDsS8~#eC=)4b~eg$=A8K5e&dB^;b)GJsFxG_i^!AUADk2
z-R~70FoY~i4~e_TvYHQ*;iszgTd%^>RFWnl%BK6po`GI=VA$z>wtKeYW}kZJPQL1j
zX|CRP$BfhqWXrSe3c*OA-~)R`YMx8v<>}IAk5@~p)M}di3%n1LZHbUI{p{Dog;VE+
zu~UwA#pYr$DE-i1fl179oopgZv7uv6qDbNX9Y+i92H<iN(uq9hx>hj2e{V==3p%Q&
z-Kq7*G=-E3lb5DUw44XjU6hF9th_hYaTAbANb?D`hY_7?FykP4)MH1bO?2b-%y2~+
zE~?nPVyy^9l_^v>EnH2pGEOthlzRjLr#zhh1e{0<HwkqfyL5;il&BUu*K*ja@V~!P
z_&lduczA6B;<KkMUSAoY((SF0^+~u6Rs=4eU$EHogmse}?W}EdZ~J+fTD)_cP1-zl
z(>>DH&b{xRdOXm3bROU8B%Y2#<?~|+G3ixzDtXs9t>!|k=qT=TdrMK;GV1f7L15U<
zF>Tt17Uw;h=N}bkrSUjpqx#+E!4ebMKGT(GPgAjtG9%~AKIzoKE0An>Lj@7}vrTzn
zGRF)E9vzFCQPcYM)WGV|ptE|hVueIS>dw_S49zq#4KE$LTYotcFzYw7UhkY~a~R+?
zWC-4SX)PY@Prjw78THtdCf4G-Qcsnp!-bAf`%!^}<9Iq)D!C}QX!kgv`{hCM${22!
z{Dk@2Ytrcp4!(vEmTY}=Q--ZXfcuy^eOk)x;!Suv#g>`c?6{KL;!0|-3D(;q^C?!N
z)6_ybF4cCJG{{cZc@tcRli%#IjWv+a_~&#x>z<%6re%-&AWhDT?Dc}WVG*{Ue9j>4
zFCWF0rO`Gk{}MW)#@{$Ek{IhQk|!@6-7bDGq`dc~($$&u=9}lcQ{o5Vl$^Q^K15=%
zS&<HPtxz1m6+m<@y3fG1)t6^IS+di8W0)D8*PNYSMeV|Dk6jPpUfPt28k`IMK!4^;
zCco$kv3Umo3?b8Zw^MlG7zJH<`0}p>R1w6nW57x#H;++H8U)$9vcH3`<CMRpKv@+f
z^33w>TA7;Q4+t)ljW||7G=NNhY(%S|n&K<~avbd)L})z0q3Wqz=O=T(X6t;D{c>t-
zJ&MOwc{W~Dgd)*^2u&Co{XM`bu7uOUL>b9241c$}))x}!H;yO*8$=NZS4c%S(VdHY
zllXE~PuP8Pzq`hB@)04b*x>UqSzZ$zfh|-&`hw#^deFj>;GFL!yNJxF+A*crPFC>4
zcJ{QNX}rCCtY|TEBmqL!-?#Ql+QOGRZ3<0q%w8QuPgnYvuNxaiG7T^^2emJTS5NJ&
zteZ9t#3;=8_sPFr4q@^PoQ+GT?hbeC?YSdDk#ge8NK;{BSwk;V)}87+rZg+XL?N_c
zy10J7GFeR2f6R(%B^({&ZwDWHj(T?>DIv>gtiYm^z~Jt50QEWQo9A4JYNaj`a*N|J
zO$BLhHxZ?U(ls*ZbL&StW)HbcCGy<)h`j~VztT{x@le$s^Y9~~euqcf3*kcRejc4_
zk4kd(TgtP2`L?~63b0P@$VWqa5WT9cNgAE^4-2gO3uW5GcC3pk&+>kEFp7TS&r1D9
zGxNISOBe~n`w<G#Jy3L%`p*y;(#+5ExH^1%F3`h7xxz6hzbj2CVLeIc!N;7=hzFN<
zd`=QInI*ZS#HzO@Tddw#(yh%=lg>Qi+ZJRJfTXx_kzpnU)&odMozn$u2#X1;W-kXG
zUFLN16ne8f+W6*xQsiJ2L9tSLdkSS?&dyEBs&J7#9lI*yuBwC2UXGnvZ4OPmoFh`v
z!&aW*pO$WOZr|s<+bNj3redYfp8T*ixMiF3))+a92`<W#-?c$~30;<^T$|@HNY;wh
zWsVnIck3^E5W1C7Ntl4DyqI0(*5Fnt)jXKi8<oXqVEp<w3@7-&pe--#)1Z0wQ~ob&
znGX7FoYX-t2R$Rr&>xKQht2&t);o>U=A17xe%z%XxZ#M#Yzaw8bjMkA-UwPd;2&8+
ztF2eP8G8Lp6RN?{-B&0Ir*$t^6K1nys3q}Cb+62B?L{S>j9T{{8cF9aULP*m_AH!U
zU~d%<zGca&;(EX`812DU?Ro$1z}~Dtmg7XMQN@+yh2OX!NI5^@(s7cdAH?KkN+myk
z2*lMnzmzJ$&>X>}WYQ9Q8B{w7FIFme8p%7;@qn43d+>TLNn57xicW=7Zn5p?h%fD}
z&f}B(UpvB3V>RB%lEIMT{c8Tues}Xh(e=u_tYk)^oG>P>yxWtT)V<n_YO6!KNY9#b
znCn`3f>YNpQNS7+S=xP~(2|GHsBk;d)L`F)Bixtc(L8h<Q~VS}(Ta11Qv_=QwA7l5
zn5SsQT;cRK7y4E+)Q2!GnS_qT9$^T+TTFY}yN>uUN~1#t7AU*OaJ1NyRV@hbUKiNi
zeczv|Tx*p!;<)6y8$yu_zT;v%KRR$~cYRIQ?WX0zGoRMyX1YB794E^-W47+L3))Ir
z0jBJ^rummaWp2&4%AyKwhF6F0tmRs}U(r+QY_hpV6;w*K6VK5}sFSR(l|ACw?yhvp
z%_C_mqsn6B?Nq>I^};u$l?o4mZlX_$4`x54e-Q%!gJ_~Y_$1G|X1_6cWZ)EQ9J%&i
zEPyA`EVcCZTd^L})-Y3Nf1`JfN#yOT0VA`f2Oce_;gxi%2}K9VZu3>#eM6oVE8w0S
z<6}|tS|ejd_m(bO;?670ip?wsJrRn+dF}m@X?MfmysR(#f#Q3izDj}=CSD~D#rnKS
zx$H>XRl~nzp+a_lbNg~XT4-$~Errw3MC}&&p@ZW3ER2ag&$`W9VihAuR{e(TuSrEW
z>~tA6*p1a3b%9ZSrb{E0cC`e84QZfPy)qdEk*eO#1GQ+D8&WS_7lpx>b)>|C*knrw
z$wd*zXs+qJgT=1}DRF15pko87(`8RtE^pci?a{=Z|NcBaB}Z6qu3kckN=}%}vp7Lp
z1DDNj0V?wLvpuXu!%EzhCXdgE8iY{uzn97}s2p(1tM5wQnPZ`}O)&&Xk|QA#f&|p<
zx!50e-rx|4XZqU7GH{U}3aP~>w&(I=+7i3f>d(47v-dT}BQCtA5qXy(%<ItTWU`W?
z+`HB37wmZSt|!w(t)6M~cq1xOaZ2?37cOe@WoMmYsc0}TgYsDKR?-bG@4KK3Jge8r
zRAM48n{0hgsBh)Fqq|IzZ`^zIXW7ppF22g`ik=y#)o<xyKZ>cCrK8RJzP#B8WZY>f
zHbPH6&5P{V4<xW#o<15UR)~%(lz0Ck932E1=v)CXfR+~K@c=EqPx-8Gr}nAYTx`}N
zmd#wgU*{<>{WeBrH8*ldG$tN1w+yh11cjz2Kp2j;KT<@B`!2+b=jN$&A3E8rH1IU)
z<wUfnLvBc_ci%a4%0BUyQbu1PRR*Xg`FHPS*cB7RSjFbD77RqbiS(Dx%C!ri*JJX!
z$Fp-Fbc`gO)Wq|*?u1QG$6=*eDm0&*H0xhz%5o(EzmMyJNn>Gi0vQAGO-w%MCT$Ge
zQ?Mim*Qg5LLkGxZ=-r_J3_bDERo!sEh$@j<DdXqtKuNa4OxGtj6F(00s|wXPz*u!u
zaLwigx6QGTxx!3}RV&lxX#Nzb;??>7?p=3qGvdN8uN!mHdNy;GHNd3SX|nOPKEoJ`
z?)13!wRbnOZ>^zi-Z2E9kkb{n`tZoAION0)qN_i>tVgScbDERqim=(x&qAr=uQ=yv
z<{RjrAfUb=Z1)4Fm?p_A1)N!daeGVbA^P76gv+Fp^(LebTBph+KL<p6kGxoSd}R)S
z#ognjeyNV`C-gHuXmze#L&XO~d&aAPj9AZdejL|^n%NOG*(vN1{9c)>_T8j`bOAlk
zIxndUS+rK^S&F1lD}heI(KUii?Zfn;Jh%vDMZ6{oT-Vj9M=l9>;}BMA&1^FcYE<z>
zwHqeKzpx*Dp`60Zr#$qtnFY*=p7-*2Ns0SZIiM>ZC09?rX8`M7wLCLLWY*DDL$Oyu
zU$2RcnlER2{DXs4tO*cF<(s6FG5o<*R#}g}9f54wjr>GY!{DZYS8l<2sq0_3(+5Yg
z!c5!!<4_E)*?e<*hOFvYH`CFUIBPp6MMXDmTKPO09*N)yL2tb$Ug-Wf!KHZ%kOmQ)
z|FJQaBmqse>UnqevBXbI_INXzc_iAGBYdO08q7GJP75@jP!@FcPmgR(5LAN(giI>&
zeBom5>KT?LQpkW$r2s;@t_9u)<JY!;GgXXIKB#<7X7J%}suPQlk$XICyu7{)U0UKC
zRq~+2OF7JqelKNyzZ*OY-Y;>K%(8{bi6S>g+*VX928y$;E;lgF%--3feVyV`e0D!5
z)Too>G87B)r1+dTbvi7oHGD%r?g<jUD3%dEi-ilhAB@Ss1>VD6s47`{x1UAlNgbPW
zdZy`SxFy$kP5Sw)UgT^q-_h{9?Gh}XAlBSUGddFFE=sv~^=`xR+*`$$Sq_k{ztzNF
z`N6E)DL3w-(r3f(Iu$Zhw)Y;iuR07~IRns?y!D_ZNT_nSv9G<h?f-m0niy?fA7tD5
zvB&0|zQwlKzc~JsPBBW@ZZf)PaDR~F>C4-%d?AOc1@PL?)sE!8Tvf73n{TV%1Dq3B
z%W~9Lmdz$|dY#j(?iWLNgaq~R+gf$nOJ-IDwiou<<~5OJ4gC-|_gn58DpR?VGeg5q
zzg~K=8M3I&y-epaN_b-)u#C>jHfpSL2p8`W^%r@drw3F-^kXOq9QnMu)ZZ*pbd7B*
z%nZzGHYDGP4n+&UcdZmQwA&|DU;D_clCW02VH3~o+N%<im|Z1!kXVp6vd#4UX8G7!
zuu1#&nU~yF)nzk9Qp@CAF{Wj|!(A0&VNA_jwyM4oniy@O`QWzGCD0~UdcfVi^Mty3
zb3AzRgXibuZVyrD3W3s(>P*q->nR5J?88%v0dp@S!}?Ep?4TGj`X9l(P+pZ^X-9)k
zaIW9rTmL?tyo_FNMmx?ae@*%3=g%pk5|`H*QQ&(e;*({5=+&(2ucgqnvwT+Qs-P2c
z)UsKTWp+QuBwhBmrcA-IK+zDGD8_KnxlGw%*MntA%RGsV^G@#&jAEkSkUV;3B7W_;
zu2yB3MWRn4dv(kx^QcLBXqWobfkC)kt2z~BDV&7IC8Coh(+(O@ZAt_Z^Gxv!%>Ys~
zX~Um-cy=Tz)3HE@`J&cKshPH{;Bsz{`~+I@ceN{P5Sl!JPkqUp)!;jpg<JR3<Wcj_
z$+o849y@I9$t~OERji}4uX~PX=8hjqmN}bVp@(@d<I?4QvNf1P<I5)2yv<V*#6rqx
zG7<gZkLZtO&>ulsb1e1G=A-sJIe+R%5>Pl?-XtyI`h`|0_2n@0DF3oE-4!@2Q>$yp
z1fZeLa1#V3vcYM~5Ud)=KlSMrN;E<u4Z^cOa&9>{y@=6GP;m^&a(b4pP(~P|fgt>s
zKCBp4$U9(%Rzv#xY?epkCEnQvYDqK0b4`6!2~*X~&Ek8^^z1iQq%0h|&Ahdfx62O$
z5W^;WPqm!~OEcG*PD&p+YIa<`>#V9SzReX^fzX$U0Obmw13&N4+N|m0K=2|}@F+QA
zXG{_VZ3V2-rqPUO*&s@GhmPZej^CVIJ0)a&&Ukh^=?v;IC>;hwdGfod!2>j!gb}E`
z9|HWZpmFwSY~SDMmF;xY<x!=fRBb4<=)PElQn@74mox+6^>mfB_qyVu(I@yts6WTs
zam0QvR9OcD!6aYYOpbmL2;$??M{f<g#=>gpb8#g5XLy&gY=5-@$YI4h4<F*cIRrYZ
z??v=gw<ZOZzTv4%G>n84&OOUo?wjuuwMrt-ew4>IvsTh~#Q2NWvxfAzyDEG36YOkL
z>+QQgx#Qt1&H82!8poaU)_b@;v*IW37>5T<HT?3qf`pSzzOPcXW|HkB3_;rLDWkpv
zUQ*MPD94u47aqpjt14FRyl#Vqtdk5`d)gxdvltpdy(e{CIH6zr!C-dJ>iu(z`zn0g
z8=17tNDg=cM`zbr!^w}sG{HfzTR(SunWP`RdROg8PRSP%`bW95t2&Rwf8Hzvw|aYe
z`ngJxVWfD*fn4a~-dqyfMJ5IB2>!Imn|D*Sqt=(_x})6vIndzU_Nvfb^lWuu*YJA1
zk$FB!mnI-NYB1Km#VxOj`T4!CE7F}uYS~mldxiSwY5aI*yVIB_zMqh#<Zw-?Q6w?V
zu3oNnz=(QzHeTeGrzL&rhp_x)Mu&8w{Mj>Ek#zqkom&m0Q3rANiLr?r?2hWIJs!Vz
z6fsBpK!7l^30Qxnuk|O0WV`vsM5G^FNtxQ`O;EM?N+K9^lAKjblK3b#Z)-e%v%^&a
z>g6AecV%0rd+wkwDhV+J^M~lz*%e-0>(NjzFqX0Bn%hM$2KVZ<n?^wa11{Oc2EkD`
z#)W~B$a}-@)kG_dK46iNTF9r(HJB&nB+r#(E|l5~w!}k$j{Klf;5pdU<_x=Kkc2W&
zk?$yH;Mqb=YtL%y-38NXTk5&-S`qbY=SE!E`$-DTMICIOo1R-=&QTl7y|<0BFR-LD
z1$0SEJ;c5D1|l}ljiD5y*zcd<CAo+8R4>}Z9Sr!{IgE~?TobvJGYE=;B0;Oo#TR@C
zaL$d4OQDYcv{*9zB-+WnFSb*m;1;H|AQJMi&mjJl>{9u)kFql?4FJRmwR5^>PnL2-
z`RK(YWEJ8?XIZ7c4hl%V)9jtgYWHx8y0eoVeM<qnWd<$%jJ58&v;*b6PqA3FORIix
z-eS#RrIQwOd2)qzxJ+mDti8#`iAMd*r{o+O?Yndeog~iGg_%D7BMWB@6DNM(U-<1a
z^T9|$H0-}T?hyYrj@NAa_C_7v8H_i$P})<XwdcnXc~K<-dK{~ZI4^R5zvqr_n!{Q?
z*x#s@Iq2orc5qM@VrUMfyV3$)&hb8^wSCO(<@P5ldpjEmGp&5rJ6Y=-hEHevQWs9k
z(-6q4FAOs8%eK;8aZ+Ba+PNp}Fda4o(|5Y6++FPzMUxiGtopdhLHnra=dPKbU%D9@
zb`vQ>U}|+CzMlm~!zNwAG(&J`d}mpJ<A({KL9KM37Omtrn%2%q&uu+|b-veVH9{D0
z&faK6T}RO7-Msu(6Ym?D>O75rs5hfBmiqj%0_`;1fJxAQH>}rSJi?X4uiC3Z!yrQO
ztn$J{1Jm*5-pHcC-9e6zIfNpjA*vcF2--Dm+q71+LWr*LS`nz%?i`u|F2}?;ys077
zu5ke-xBwZL0|Ud&NmJA?5vNGy&8$ZwRHaWpmwab83H)3jEbE4Et-l?}u`_nD`$>`R
zC@Vj=Cs#Ag4<tXp)Laokc8H5r0H~#%Jj9>}N~~1OBg8#D&hVH#dCtDkuIqsAZG0u^
z^8=ENsPJ`ae#_V<q~zs?ziL35hSLlV;m)tB?OFEDPBRf>S#f+b^rd0Bxh(NPK8vnv
z+p9B?xaPmcKfxpi`IWVurD{XnQfrI-D?MSRAgNJw@eo{yzusUdu{@bz+XP<`60UkP
z{S)5Xfl?c*kJO58z&v$k)i<oLjb&kryLnFn;NM=C(Ulm6i63>Hdy~=xIu1cRoahr?
zd7{aTXO^RKE*@9x3v3ijt}L1Ty3c8Me-N*;EDb-}ivD<04?+1N`a~lnTmWBNVz>O2
zp?iNf&B|0a`{KYJN!za%Z|25->6=WwIj0g8Ke`r4E|%>g^u^xo9D+clU^e+R>Su3g
z!3|gia{)`c!7@}3dj=juutwo8&mlisCnSg@Kmcek9n=N2TT~biT4y-&1rY|_tc$2M
zrEPqj)#;_mjhAjPrX>QrTi$JHACko0IhfVKBiR6rp5}1b99%saf<5l#rbTcYip_wl
ze5N3|+co!gd6#lp+J_KTaZq5!A<tQLi!+~|zWi}&;G&3oACdSj{c$|<OM^43CXhSY
z*tl67O8w^Q&U{hf{clo;`P?FZ&eIg6-)EU%RSl=r=^NcpPZ*+|ce~teaUTYdJk%#*
zUVb@+(@6|iZU6K267=2}#kZ@CqqClKphEfHqrk$~{}|wI$vk`>f9=+4Ag#-@9ydNp
zkcqmgH>g~js<8#0%Y3X&E@Fp3P2=X`c4R$K<LaF#VB4<c{ql=p4sQT|&qOm?zX$o_
z!tILvbP`nSTfJjyThrHq6OlSKp-w#jvm*$0b_f)o*CHe@oIaDTmc&JEq&-5-H=TJZ
zaZ<65v>--;A^6@EXrN0r0ud4L+^21;wopZz717tFqani-Hzpmdirf44j&zbx{j+m1
z9(&t?rIo6QBB+79O4p5ei%KjCo1NIpca`6zraWAILBXbpQ=E|14c+h4z;RK}(Y}m~
z>%v-0p?OMorPoZGyyxiGh8vKpkaH0h!dj$)lC0`ycNt6oT%@P7Y@pTsB>ComLib=5
zezm0JJOJOZ)v@j+2af;_>w5PxAT8L<N~+jV6o^FTj}2pT?@PM+NJ60PVSkS6uZ{P=
z?2J}OizFANOw$(bW=oyrr)=_rvZ$bF*RtW4%YHiI9OV+Yyeg@AOU4v?p6UD5yjOw{
zdn$T<TBR)zk*mZ6pm)q!PI#UoIPcI(rY}EI<>mTQPQE(rtXc*!J4EZrS=BEL)xJFk
z=r@Um2%FUxr;j@0AR6jf0WkzfN4pE}%|aPO+MFThT7??Uf8XP=P4b+Td581zSII`G
zGAVX(NYm-Ncq<sW=XKJ;)TOOqD~||2Tm6?acA3AE<e>%^eK|sxKev8ZJDX;8)Rcol
zcZ3U2%!xJ&_mONwgpGbvAPG{?cAgw4jyP<XM1uZ2yup*ruV;=TknrjgV1@JpGKBWZ
zTNOfU9kv82<>wmpL<zFfQs2>I@1PL!lfv+(P^l_GsmVvjh<t4s;bQv`+M#&kblmP_
zqSm!<E*w+{_u5=EQ5fBs%LsX1a^T*y@(GGb)MwrYq*-PhJ+Gh;YQUC&3W?+&CAu`l
z()~8OXK^_qW$OD_0qZM9EtJ@3jYj#<aYco4l%V{V;1gFwBG|$(RT0q=%Wk2VgU<%;
zE*RkL!H3WzlvY0VNRPr1f+HpsvjIyB-=N}>W4Mc6XVybq@#1R{D^x`PIoXJ2L1Fh<
z-(9}9zRcPM@p_qVj22E)e2w9}&dQjI7E3;WA};oks6q`wcx^?W(jrz`no;?Q2wrFJ
z*ecn~7dzNU3BZmZHwl3@UX<<p!y2lmuP}6BwAz0H!$*|B$5L}q!)l^TpSmcG%9dO#
z4?OyJT6oo8xjHxMm)%SES@hho5iYowNKV6Zq1!BjkT$H6QBKj@b5F^xFFVF%)OjTQ
zyXP*$rvQTH)&3culcpO#zo}o}$TbvSsamRHuh+^68&}I{tGmOU0jZziT7QKP_l7T&
z#jI%&D#y~-M{`AuGs>YxHl}af2!7mWz+O%x*tWXm9Y{7Yoh+k%-X<i*VM^3V*@@`P
z`Qv%zvderH5iX+}qcJHfQ%ZqTN$qYU-|J1=Ur9z7Za^Qcm8x@Y^m1nH(WEIwwm%%q
zwaE3z3hxs@IrBBRh;J*Vat(IdKOStk$O{WQjz7s5-wM(SmlgCl4@UE;>dTUg!jLN$
zc5$fgM^CuhUx1Ph$;C$0J<`pEI9)r3CBt+D;m8%GU-5}Zs6_bevP^9UVmc2)8Cs_j
z?Xja9_?Q~K2L~n6_y`4;fuh15Ey`%)n6Fwu&s|U(ES-5wNua$t7jdQRg?5pwqT0(B
zT;`o#rb|WbilGrqOryuC<O{ryi86ojlG#vERE_?gt(CE8g|E9_y%LIRWcQqnX{$}#
zFo=NqN59sw=v=gI-tCLNmv;spNAJOTyMmef!n9ejYNbty>Zf^4^3E*F#`o-73TKA<
zD2r(yE{XPwI%NLf&{AD4wdMYlv~Bi$cZN~UWUhLj31HB;_gkt;U&$Pc9`-q*&{otI
z@!n=26+b}r+!fS->vj*TaFttmR*|T=@X!67Lf4ujjV|l`5oL~Z4&ORvzSnDPh`OkU
zW~8Pf`z3Vy={>*Lr)GCEkHe)+ST5+<X%e@oLeo-QaGz*InPqF}kSp1^>jdTYwYA}u
zG_4;inM3CzADhl8RT)R!9nT&5$<)s~f(`W}wv?CpI`ZYydJ#PJJB!C4UT}5BCrMu=
z<J9?LN=2h%rw!P=F+ArFhMBqf8J}p4o%OU6+_`#61xJsW9O&6Lj&!DoN(SR<zw{j0
zy`uFgC~FYRj>nYjMvdLcez<D$gO(k}SkY!lD#K-V6Fu(Tr6pL=S6ky13O@K~l(|pF
z^{fb((_b_?BQ}Ih-}ogvK|-eX6rIA|;~Xy3{1L`#qEYws2y~Tec&xQ&tN5+r-R%V}
zO&(K{y`8S6;R_p=4hRZ$>LlU3)nU9ZY9c>d<v15F!rYf_hU`EgPov(K`Qp<vSHF@K
z$9kELNkDtpb(O(hJnaapFA?Ll0l{X2L}JAJmyR}*siiIszI!n1V%0-$j=Qkj7M%-O
zF2BUc^J8a0O#a;A(Sv`pn_uG4)1L?I!3C3V%Th<YuVZ7<I=)@_));G>SP3rR^@cx1
z@sE2Ww?kW#mT<Lm7<D9-yyx;FeIl8aW{AXoK2<G{t^9GX>slzM?hjm*t2^m_JR%EU
zKWvPu35tC3<q_f3uJyOGZX_k+GOro$ciw&2&uf=Tj^2ONFMdY~-N~9kM7QkASnqM5
zpX2qZ$X1s{BhyTc3?18%dV_JLfTGF#r#%PLC$hKNJGzoeuM_S??0v7#py{{Up6=|a
zNpZ3fuChyG=KD3DKv_)wehVf1bA50|HshgGTs(iPJRQ5yulXw$5%V6)3amZZG4q})
ztv&7#SIk`rB*W%>Z8*?hD0huPskA3H^1K%~U8_r@hO>|2@_8?^&t<45xh)4yC!IX|
zP{Oh&^PBk^t`6<Q#2Ajy`7tEVTc}JmDPQ*BI`(bm!-`4A7(4<gP)nqU(fK~F_m!#A
zR9lwDLl+%08K3B5krx#LsSFy5Wg+yW$5pAh>@;m8>tn(p!X_hU`TR71Uqsk52+uy}
zq|(3O(tf3VPK2G>ZW<??v|TjDB9VeXSHSS$o!M1GgY>Sh`=91P48QdH^d{j1C))O<
zfKAKP8CZ8`)KyLKDQ%oZ2ql4TaT~Rwi=LTGd)8Dh3EP=IolX)C)ad7s%DkfF&nHJ*
z<bnujCULc|I;!I9DpvmT|G{#L<9#TKyZ-n`JRNm)^&c>$(eb_A3wWh*ISFY>L2@Zd
z!PWCN5;ej|=`m@cP@fs?8KgEFDJXpK{fkrrQk8{hGTiDU{;Tp$WFzElA`7?FN8aB_
zyj>fSy1P~6F<KkI#A;aevyCS3%dhW{D83&QYTn6j#Ztgd3~qWw>uVGg69J?L?B1;c
zL9BaJ_mOhL;K6*_fdC1I!=41!%94brk)k7sUPN9bLb>%3?-qFEHU(0Nq~aCk*9_H+
zyGR9OWSHL^A*T8ru^vbjx!-bPp5%Zh>(p(pQ(=>;!G-<WenH9T<pnM{B^MDk@(u^o
zM!8u(7rv4KYk(r`G}gmZon1pTwXZ>X2u@udh?4nr0)FWu-q!GG;|T}VKnYq*+2who
zZ#zQh)0u2o{VWK1gmd(N&3ZwN(X0O$EB{z9HTtmo#aT67ABp1QVCjJwxbSlPAref)
zBu*;wX><cXe!WB5owQae4H7?b2WI}d#Ax~C2$l{Dm}P!nU*LHQkz<ZaiPqqh<{58m
z_#a|2g(Nm*9hyv}U@|><mIx-36g-Y|w4d%M_7D?*kMj0Y`Ejnnx1CJz<>*{w4Gr+B
zI6UlkJLuPr$gw7XJuoy>QB2_d6^z)xJ+WkeaLWB)<2p}TvxuKaACg7gw@(sLBn`4h
zF|N%2w_kf<Tw~&Dsfy|(*TOjE#e3awb2HIlGJ^&4OSFNm-W_(>g%Ym}wlqQ$!REZI
zOZXH~L1S+r;apoodlo1ZEP(soNKLGOcg`stu@?y%VlQIIJY#Ul4X>z25amWE^7}Ug
zUY)qzi>|w;S~kg?rWYI;dS07IXs$Px9pV<xX3K$BO{P0u+rjt(g{ij0lt<Qt{ztGD
zfg&HsiFYBjlJq3U*Ox}QnN?rhSLudsDC}Fy6ZR^u<~YD!@K>%x-D23pI12Ra($QgT
zpFclf{0V5S=4ayjEC{+2LR6nsDznV3!Q`)#6~B&;O?neJ=7#G_Crp_Grlj}Uc`#c8
z&+eS}@|pv7x!sqhGx91;DW>Ub!QC`S&e?qr5kzfm3Ey#pO6of2gBz~c<NC15h-=vP
zKdYDo082S#+N%PpO9u!vb|L;<)>p{mr>Z51sTZ0koVoh&{!mXs&}T%ATdy7~k)Z#x
zi3d`W15)7vB0Z)HxPf0(fnOZ;<2ebah3!B9O(k74>3r+=SmG`G$p^&(IxHL^C|Mr&
zWvjmif(}#OO?<xsE@;p-cAddXLLUB(;Xy$GkvANNHZuJz9lzV6mIVhJDelN2AB5r#
zGWZ+SOgrV@k0X{HyyUye0E=@_>wE$Jj+BaKhgkdF{nziI0WRv>&RNk%W4;m{@SLPZ
z0R!v-(Ej5`*aH68LnZ8Ct|ImXC?*#8yVvFgx!m6FJL?LgRqkm3mRbH0sfOkE=NkE+
zFU$O=FTnq!^~Am5WSd|f+QdvoQuKM5ATK|O>&Tui7m&c7VB_P&904-_oJ$GpDZ3)(
zGn~s|>2yIQw>Qk4Y+Rk`o`CMnQE)!6r~mmHfE{bt{PjIeI)JdN#7gTGF^(e82FpbO
zP^bYwPPC6F-onLVZubaQ9wiX1<3^Vd5G_K9nwLvAnBLk?sh+uH<n;n(H!K54_1OP>
zAL8DUJ|`L>>NgQ*_A(snI3Apb1W=*ZHEZ~4Lf}=QC(W-hg7mNt0Fc1=KwjD%@S#J2
z9)&FDzblPFG$s;|ij8#6Lkt8o0@!%oURXLLg)oy@dX?xn&{zY@s}1@+7)jq3D^C6o
z&A`iW0I7U^!U+uP4KGcaSyc2Yix<@r7D&ddn4y}OsuaVeb6-}KWC>ZYu7YGu?g*6c
zLyrQ%$y5Iv&msvNkD6f@DKODvFN(sS(az+TgneltjETg$p&b&xRnc`~99apUXvlmi
z9_;V=^j}T2Sv@N!DY2v?(xjTf%b`=4T5S165yIWaPGQNug80Ye*=v&jB?VCQW3a`?
z_Qvek#PY=6&_wHuh=cj#;%F2QY!2Q}N|&5<<=sB3`cOKaN2}&KW%m;K?hR?5b^9r)
zpsE^r;NOY$=c`N5C%|%rxG%fm@p(M+4k<}`U92Sc^*9c_Qf0LB5*HwigM04?hc+L7
z7(6sV_d=x=jNBAl?NaN0W7+&6m|oVeNv4W1UH?NTIPc_u`$gRqlrYz2e0?n7oJW}R
zh)aD=Bam6&=^TceU^rYO?y#8VGwLrgd*(3z^~OEu;eE|qz~&!bM1l>ph@gBCONJrB
z3x!?)_Jy2seeYL!6TJcO=0D&9l(`3+LoSpy9PlQZkJjhU424=_DtZ39A9~{JK(6#8
zG>4>299M%731qY*pc$D`D6R1ME2mu%X~=-n62~j|`RSRv>qgh~aqe3O{U-gxTgGMu
zLb*Vmz<^nqIIkx2MvBSEdly6|C?<+HeiS_gMQ5-KE{t90=5d<WE>e4TR`dt_hseTH
zK=zMo1bY03?2yl&$dAqCS4Sfx2h(SN%x33@PHWF2iVK_3DEDOA;QH(rB>?S`x7@S#
z`PZ)KiIaf|o)}qUf*Xj4$kxh_K6>nge2Q$ykAnKM_>Rt{K(W|Tq_I4vK`_U!f8M@b
zbVDBS9?}CzA^$KI=DHpuk+P8>&cly!^dv?v^euys@pC`rgxn1luKBxUz(O6Yj`<KN
zLe?H@;p5alT!8tB1XR{Full^-$HsmMLdpo(@YETeYj3pj&(7boMyx7q&PyJ<54T{z
z2b1Ui?nR?4t$c%yok9@I|CKrLlf*-ijo-=(s1847z;%N(@bb5Sb4>s(;ZD5bOlV<<
zIa@5;X{-%&hkGC?lX8uEiRBlGil4%Ntwj#xp6u1YaHw%`F1$7VgvT3{2F$tKAx(6A
z;6*#UxA&{#Km&~SlP%$R6l}3xX>MD~l1t5CcKt(A@KZRLvezc-M}EI0)BWvLi*isj
zt$bpD(#N+vhlo+IhG|6~X(Esc5F=5RU*5!dr9J#L{_2p3{=1;bYq&&8V|zA)J-zEF
zPm4HYrlzJg09r0hJxwt%L~;%ZxOt1Zt~X3~rc8iZ4BK9tOYmYcta2lviv-&eA*28P
zBZXY!@Ke#>FRC>5{h4^Qn)sgPJqWv&!&sh=A}6jOd(bPjzI?P9!l<p}zSM6oFfkqd
zi@d@J;Qaf&+E_8)K&l~;u?lFdT)nbr*QH9q%OvLfNICgikG)5{GG8Ifld50gWCsa<
zIftuGf06tXW#B^|@%6)x>CL@iLQLEZ4m9c`@GkkPhcy`0IGHue6V0tS(MT~BK!{k%
zhV1Y!{dM&HAhIJ`>1t*U_oQS5AVw5{AB+G{ZHnPe(hkWe0p(Cy^U)7qrN(Z&4!0bJ
z>7~MW+dJoi>Ha#YWM<%MQKe8{3|C1=U^Hutp$986MGqv?<*{`{58(i=vX*x{*QzfM
zl9g1~|2b;RPsZ2a)nr8;TOLX(|C-0g@mpA;fzh;Jz+YRPIbZIzBi?V1LsIOZ5y@|_
zR|Ba`L@06L?r6^=@BIgH5hG5<bb{Tr!wYr8qrLjD<Eb&LKDzkn8Qv4D@NrE>4U*#|
zyQq9AX=8C%PsCx`i}GKKcu71(d@b#O>vWi9Ht<*H`qX;!4Y|4B|9t$M?Pk*+LAoCt
zQ(+BrlH2)hxn{r+b93sIp4vJ+|6jU1+|}57FhN#6ANHy_;_ZM1vp@3O;F6JxR)XSy
z4n#+F;VQonx>2>iZPl1~rR?hFm`_*c4MyL;Pf8OxsVn}L7!G)+Wwbuuk0Ngd8<D$g
z*eSu4MzI+*v57alo<_QAA{!Lo2^>|*)h&?{J8|WYLqfJ_k_cSWt7Cuh@SvX7K%o_!
zI&f_R%t;N#g99<bHh46{6nFWq|8rK2&)6Ik?6<CA0<HhB3;kzM=Em=RGS4Ch=ym1z
zS(rTBUu+SHjNer+HoxI45OE8Puj^RMYA>Gy0-+f$Hx;<@q7ht)!}Xm)u-pFJ8T5qv
z4M#w}ME?3wE5%n(#ia)D?Wk9Hd^AU~3C~_N$BG8^&CCB)nn?75A07qP-@8%;Sv8)T
zl6eucs|sw$s@#(;xb61qlU=upV0rd~VtC>)DOmaexY_{2WDcPb6kybb{);{XF(H4S
zx-^`+z_zz8W}g>gYY=-28Lb34_htw=BpT&rl;hC9>OO8+gEVvGjaQK?x$)N`<A}??
zzx|XN<2t;%CNZiz3Ly0N``dHsuOyDf6bGR|UekMj$<}H{bR!f<cD<H93AH57uk<%*
z|CCQ~#j?n$7vIgqkgeCw@z0^qtp#eGuI~P<?aI;N-4KkW1iuDR@K22Z2l(VRv>)y&
zGE>w1dt2JDt<Wpa-e9&>S5{MO(G64p<#D$ktxl7ShP++}GQVyFY0Qg8oe1~NK<>20
z{p&v@fcc5O6xmUQXC}BOKnKCc_xE>fU4DeirGoh8dHB<as_c0>gI5jA7h}t>+J56b
zJyKb5`@tuQSASkUkmWlGFo2FH%9CL>I<7S~daz$MKTx8CjJ^WiaNZU0jfb#@n)f-6
z%@1zRtY*Gs$2iRP;Bp?)@qgbh1{CSLR7Q&v$c7}`@ZLk(o5y}q*=6@vJyO>5s~^WQ
z<Or77@v6;n{4G$(EmL5Tl@(j{5&pdt6$pcOnY<PVPk2e};%eKzD-x65U8Ccdr4Ot5
z(HLpk^qdvCvR)t(T@B;2>gDL*I!u@RBMOMkKodOZy`xEm+$6+KXauaos@!)JhtZB{
zo(KDjmbAH&kca&I>>T%~=RxI<_9R2Z@<jAHmQ{ztzh17U5*Axr(9;E{8E%Ie%X#QQ
z0Ii6F{OW6KFv-SKdo>}i=!W~0Mo`sbf6I*3;Qc#X?Kd~GPy8k3yp-V9ZVwdj;Ul}E
z_rOMUEqL3JAl1}i01BBnMMgSAOcabjg7cWQ(E_^cF|Ib~iD#t9S^Oh<;l^R5p>jR~
z%t~U|h$xE_dXVfe*Q=5Tne)v|z7mk)4p?%}&H~&`4#C^EG*A#|J6ym=T#Wx}UpSp%
z<n<N}G2tV6;*)<HWx+nw8Zy|#Z}I1cEBdfGvTx<`SmjTj-+&OoG2DMIMUSM{NMA)-
zoX1?<JpStJmuAgn7%UsK*Q73uP5cuF9MezO95jOWo`YMNKmwJO3OIim7#E&^sC?|e
z*NQQ)pV)XqNng|Wr(n~DFb4I?M{8g6I7$HC#fT_!vbTW9)dwD725J)I`PU&G0X!xs
zUn4qIdyFDA)))i4b_`^+uOoALknSkfw<2`GKbJECtR9bL4~y{qF)yHQg`7@v;=gwl
z4ZAX46U@c9yH9R-Pc+9gYp98urqgt-(Xs$xRAAv#aq%%~tG*Co(7OG=&tYjq)QMfG
z_0Piw?xp=t4K?OIiC@$Job^XKM6P$YR+`^E`2Kh>Fc+w!`8cZW1uJ*eX0ottU!Fch
zO1N|kPOAQ`f3Lvh6?9YeU}Fg3!*LjC$i~Xb%Hy_qKT9)9{jpVt8@92U5eyk!1~_a&
z(D1|WAxP;>odHwU-?|_g(FLr6%y75}*Ob-BopZl{xL8AOE3I~&GE|b_tz5qF6Xr8D
z_~Mc&hdkO?<fIdEc=CGS2F1T<>WKF|1QHv$T-px(Er}D}OiIxWPe>S39p`%O0I4tz
zw%g_NKOwHrRDaIX+R}d1eM13g%QzLKdF(H$ssLgytT-toe#itC7^2p@S1uDo#m&WC
zxk}1~#LQ~IRZWGGP>=Ry&=$>FhVD#_jg5%0tN`u>Z5~Ac;#&N18BdtOc6m6<Ig9a~
z#vXZlA3NCu(;XSW<#_QNFcKQBW!C-YAvWb$d5qv1zy=(Sy3$2;CClLFXSl+&4^!Y-
z^>_F0xcvNZGVNBbuFeh4HcSrbA5JBfLj+NejwS`>NcXUM^;UjcYd4UJRrXssVNvW_
z@!}BJ;3slp1u}~B@{gmow4CY7PwIg#;=@M`J+@4@p!zKpAnqt^B_LJCD-0Mf@DFl-
zrw)QVYFIG^^Md0yYjkf{m^4P%L77Z{g>!xetB8Il1W>-0SoF|%$C94rMLh)%vc3mG
z?QV#pnq^Rlej(zhh=?SlSON|*j)a4{_;2fx_$TnxN5{k0ln}tvX4K57CZZGhQ6wOu
z5-YZw1|r)8OGLyPn01B_0Xw*id7m_aFhK-#Nlrh}?d2DKH(aL(>8Sh9F^xGvSiNlH
z(gKF`AhPEkdj+KYt2vOzqAufs%#TN25|BvLRMY?mbFWObx39hqrR*j#LBFe1B}qR>
z|6?u8s$-!4{pJn{38LH`9VQ_8CBM9a_?R7l|Bsh&(g5E(d&A{6zP(fHwoJ})UBZFw
z@?HX_f9F3t10w;bL_+GfhVk;j65$NzLY-4C-=58eF<<vSST_j(`ALr<At8W=Y^`PK
zmF4NAFqyTz5W?V~{@B}?7HkV2!>f=Ipai><=m(TW;%EmOR6bIG2oBc0`V+94zENWY
z6OVD-lUEPp1#H-!U0s&HKKQ@cZsWkO7w(OA+aXl#eVTL{NG8Xydc-yaAX1Egiw}IQ
zt3iQ1u4DwWgbRUsUa=3C+US*o^Kp2d|Kf)~vK*jj>Rw4)>h_;GHV%aF$!H;q6sv&y
zEg*g%V{0azH+`JyHa$$hwQATYIQFlb^pY?JOW=a6c()>ES3<-%d^TS}tfc}{`auWj
zn<o<UxYGl2)Tk304xKyTg~f|cG)3W~s7n6~9s#$2F)9^}z3?y|%m!eK1T6H|b$<dH
zo||0Ilzn-9(6%0xmPLVend_46$kj9amYOxXz=L1N@_sagGye93{y2Is=qJ|qTZ3M{
z@D>UV&j?s+r&G@iu0rLh+o><l-_R3_V?k+PG%)0L&iw=d8|8QEF!p!Tf}H=K7$-(^
zLbEc6Jx`xFOc{ICh-3W*7V^+3>}{_hwTPF%41a=E1YiOxgNF>=20$8XPjy*!3QeBK
zncDv;hmaN3LBzcyJrqt)B)y+@`zOLETDz%4=g&qe$l-HdGO9VEQ~9VQBLV_b0^byO
z))($pjQ2q1>JO17LC=Ml;>ih_7m!<#NXFd-S-8B~t>zj({M(vLzT9DeeV!BmO8)Ox
zyObtR!<W3o)w1l(In#N$9}qHsu6N`pgu3zje&6wBHb~rKM60Q#^IUFO6OaLM3V)g|
zw5F*cQ@hH*sat&Mn_~ER8RMde9=v}qMV|-nIF8=QyozkiYw$Qy#J>gnoCcx$u2coG
zGQpU224a41q>39fL9)|T!b4B@54%SY<U}6+;TDaxYvKj0VHz<YYd@XNW4(E(xBl;l
zI<BS;pdoLv^CG}6`mEEM#r1FEMZReau)mbf8{E177t8(kX#V$@^Pl4EPz3$IvCeJA
zu!xBB%`x20nCuL4ke=RFdF3b0|LfYIj|TV$Rl%9`n6t)Po$fpah43IHut}$pejT}b
zg5J8y8h-{D=+7eeQ_z>m3(r9H+N}n=)}?A@qd5y6<O3nBW~G?u=Ur(sa)p=TnI3E>
zM*kt*i1R_u{Ti_E0gsVb{?btbo!NVX7xyw$Ro+H---d6dFTfVF=#UW0GK6;I1?a@y
zKOZhbl;ohJCgOL=a{z1A&D!dM<q!z*^N4ruYR!M_^NAe%{^30TED~|N@CuAFEGH2a
z*(2V*fc8>oxR=rS$4%7b!taNi=qM1?PpSmYSHX4ye6_^CMCWfh9lrs;KkFJIf;eZm
zNXFDFQ8f}`SqyM6q=kbPe8^$`CdLOycBf2V?h|ayyAF&Ug`E9#2<3#mgLerW+$jFT
zhxC4k0MY6VQrh5o2`ofv8)bBxz`M%ByZFvKQDa2h;nEUA@cTNOz8en?PYu30yV?R$
z(if0y^7y(a=O1Rw-**NLFTT$30S}XcIl#nFktmCCG!J)8{QtXS-W7Ls#<KnL?%=_$
zS?D$2Z1vfut9%Tm@_saNjMB%>@c5tTyBbq~!xSNfQb6g4+eOAj1S#L}zfJP7HR@n1
zug_qry3KN_sPAmM!Oqsz{63d&FZi#v@94M+o4NO`TqVR1-ce>Cj~9PkqBX*QzbX1Y
z(A}q840Vjb9R^qYMbJpWAnk9m!{m?-ukZyB&4a(Ft^j<8z~8h{!-?2T7;^Ti0HRy`
zi!?$Jruxk%LO{*qhctH|F%%pBVv}1yEv1LVe6$QS$j~AGEdnXN`-_%^5c2#jCB}gL
zf_*SdQlkL7Vg8HY^nkp-{^q_R;E@N1l>81!i~YxL`498K4oQEr2pCzu9RKHV|Iguq
zto=>7F?S-hV*9xRopOy)j!kbvzFo|!v@B~vqe?9aZT4$hJe20E(Q&zvwFO^G9Dw%}
z%WN~x{2{J!av%=>@#Z1I4<E=<+d;#Ct-PkiGNE7AMOfE@?uS0FRaB-rBkxXf@f)qM
zf!*O2F(q1^1>5jioyA<C-UoquL|H<;b+n<M^3s5S)zR+K|8niYvbVz69`xxzzLyu?
zoqALG-Z-_En3u*H#oJ<?acVuJIo+27xU&3~RpCot4*sn6@;@}21ii*l*+LVx&kzFy
zUBC~333bMK8BvRhYwwaH+b)=}zHw(z`z#+VH5)xufUDC~WN1P0$2(3y({%MbQKbq*
z&-1`27S<v{m97^TQR!J9adeup^Wl+f@!(Essc2)CpO@9E{(}XuC;ZcRXovxMRCPg^
z5LsG65)KouNssg-vk7b9y#`k%9GzuV-Sv~On;oun?|&2`&hsdEPREfs1hM0rz|l!_
z+v&AE!=U7{ei`}4ia}4NL0w)y^bcis1ZxXY^^IjL^%!G|1&RDfYFq8T@{H}z2-J1p
zE6OK5*!PXIN!;9!MU|=s#dGl%2?ypsYt4}~=8U0hwXWpL-4VSc+V77lEr{D|Z%L2q
z*W|S_M<urAJP7)pxyT*Y?*^lSQ3<g+yu4P4nR120>YQ!|8F8%wMS^7Y1N|)n@o}>^
zpA`1AOvd#Gpw`3VfW&BGbyn(JG+daon>|ShmkqzPtY0e9Z?Vp;`yA)b(cL=l&Z}pR
z8e|Hc+W1h3a=JbKE+DqQPHOCa3pHf3vVL*B`r}@}DDZ~&I-z}-q`htr<iT%d*ucwf
z)PKGe_hMYX$7>sM|5N#!BNM3f8hV|XTO@8nDAbzzuA6^ciPG-;P|)|)Y#eR!RTj6O
z&9i}bU(jxjc(%^J2_l|U<<)!f<EzyzUm15ctc2j!dE3`X%=kLwENcQq=cESNHox|^
z+*9n@x8DAdk2m*#;#)ocYXSdfX>TWFd9R4IDtPb|;j>F+UPi6!nd=?rh>7bzj-uo@
zap8LKek0<A*0=WWsCAetBcOk-!W~pvpQ+~Q-x3?Kg)t*U7Coq+0jJhME}cotJxW2P
z2v?^{{^_MSns3+egvtGBpPLLCHSlz_jL1JV#<hKOa)&zcb#f;~0JH21>@p4dWMW2b
zzP<jrKoYL~4IhSJ*<lC4BATvZ>H3#Or~p}W+czrYimVOg+&*_#<$ZY?owm}GZFaPJ
zSas-0hxY4&=;sVk>!#xgmY%^EPSRSS%;NilEN3&qrQ|M1aoYgDoDzZFX{3d9IklO8
z+WVn^!4dK}3IQ@OPq`3nIg`~go5fneCllQij#{@FAFV$j`_6neA04>F-Y{)~(jYy^
z4Zk>blQu=kIR26!J)Rn0&^3*RS>_34Wa31<T&D}o6?w#QI;yRH_@)zdlN-kqDoX}n
zc@1N&ed*DrK5DN${#87ozdNv#W*>*jCO=Hp@j9+Q#|JbEwz_#I!)2bSt-2><A@MhE
z)VwOq4L@T$(Tk_`=;FT8+O1cwJ7gro(obj%8fKeMQ3S4AYLw`7JJ}@tc!8J5mm9O>
zs&Ia_!@+{Lb>17Vf<$3miL&rLF?CkTOfR35f)h2v%2FLpOuKYFqe<hk@skhtKtxSn
zc=NAtVicUg?L(YM0f55-WdtWQ8(Hv2N7XOq_Vx!BnNwfNZT$9(2lwS}MUdAktMg+W
z8~mKHXM5R&ekxru`Bm-g{E7N<CF!6=E5&gd6*BQtFZDk7Vo~^=|CU^GlbqPR@^q9L
zn`|R(*ye@_LvZ?84Hj~)fu))nK?;tA#I(1)@-FNJOV@ellE#IqFG^8}cTDRQa6KKM
z<G{Plhb5O59VeRbQBot)V(<AFPSX8`WD)_q;^Mr}pvRFXTdfy5gi98iKLj3WJs@FF
zil&l2<-}+bD9L;Sg4h3qyEDP6`euCS#QOnPa-dl=6w~o__++AbH-7dVyz6gido%eo
zDXw3DB85JhxwLaSc1(ovTbRrTNylyr8w-@~kiJZJ@J+$im|nKd0cN)P`2M{>Poe-0
z=GcDyFU7A_jH!8Tl2>Nl$3<(kT5r^;u92nOx|KaiPu-dveu0geA@vfgwiChANnz1X
zPbfTHkL$<Jws6B291UrEom^tg=YG4E+%X{v$7J^$S)s`zraChiu(i2L@$Vo6FF3*&
zgp9W~0=r92uzO>I4rAJssCr<m%Ptz5pU2``&R^i2zJ@3E6LuG$Ja>+jFn!IiRB2Z^
zg@f`!I;oQ5J-^FkT2Gt4id>G0{YV;hzahdhg!JGYb3*@gou@g;R(26!Zc~4bN7nTi
zHOh`+6S>+NJ=h9Gts9R8#GkO~uA3%5&Mz`K`K3ppyLf?i)8$=dd}&M4E#<kgO&b+Q
z`4E%~+b?OUXbvv_*m)Sx5gD2s|A)$;AA?lSeFltsQYQuCgL3$}22-~mNBm^dxyIKP
zki^hJlb%A6N4ODxWuPr&xzr&aXEH&^dxihkG0vg9xAY05r*z2G%@qsB$Wh<!o{)=5
zrQwfIaeus%$)$Tv?ps*;wt9cIG27cL?~t=f9Af#lthaIVBO`JMb=73M%d&i>DRawG
zX1-{CWDJR*n=1{vn;5=%ltC#^)BwFhR`~qh=yl#dsYe^owSL)$4eBX@TMDMrU$Dh-
znFzC9D(N+)Fk8+jezTbRfrN#6Ety|#H>>b5Lons@K0cGI^v<qw@`st`e4og4O8G!g
zsGP~=SJyv7tDP^y(B8n&PBXNyQ8&JN?xwQoq^EoArn4uTfaeYn`3=c^a+~<a%5y`h
z?79v6iFL9-D<_ml#a&;IKio+bj+HHFFofYxiEeJ5FxP0bDrNGIBYuZi{>J&;C`8El
z!<H}47oW<00WC?eHoD2`pYqH~Cb02!oL}?kEY{hVTK^CxGuqv*P7!kc*D*qs2NP<N
z(RawQm8p6gTB_vjz1;jcEJsyP0vAaH(nLzIm_saY;0AES#P+|cYY%;+y)yY4=Lnwk
zOJ!>+=sGc7`8dCtR9u5To!{XN`-d>-2#4jNEXZIIXi<N|=8tp5q(q9pTL^&GB!&=A
z^t(0f$u$+K!eYT9i@@4Y9^PE7h&(r26wPsg?WUIE%?ia!CjCt<eDRDoXUtHwq~ceZ
zE~ni>+cWRDT)M!5YT)_QB9ai-Ylch3$ycrMY5GMps?_t-BaWn-*s7JJG=9?2V^3bz
zwkmfAq_Njye~dM1ST;Lvq7yFUMOoUndglwpTKKs-;=<I(XG`r+DKhBX^M^ZslnQL3
z+w+bCNCct|0c`mvM&F47vi&ggk&BNvWLaK|PbDfnTOGv1LGD*IMt)5mT%BT}3fZ@r
z4q4hQdovEqs=rp^ade6aRj%;vu-;(K4NrDdA%Dfv)A3yYgI!EqqpoSY#M`t3!aS9f
z;|nGJO6X*ZPX;R1<S)6Dz54_MlNvQkzfh>KmKqu#lZtj&{<SB}pyaj_3b|U?)S<sW
z2576`;7Zj5Bt&r<aogT@^T4c)un7^~E8-7Y?zYbF<J2b^q4dcHZOl>B{o^spQqc;5
z;q(JqDPkyDp)%QOmW~ch6Flr*RgT>ewgTG%*}_#dWoT=1?*>WUd(F$H+TSf_J*7W6
z7#}F_-v0JfTqdDv?L{12*k=D{RH@KopMmmA1^PlJ78(!U3j!sCrCIa?B^-e=;r}5e
zT)l^iPgayk;V%B%@W?{7yS3%4ign6cQ#Yd>*tA+pj*hWmh;OGS`&Idy#Z?VX-su0u
z-g}2r-Nt|55fO#TC?Ta`YZ%!p6v;Z)vCBSIg{+83k&)3bvX6PJIAo7xRKl^hO4%fP
z{@x$zd3wIr_xfGGe}C8Y{io}>p5lDY{kiY={T{F3i80UY0DmY|g^@=A^4pfQs!nn@
zC{j_*xR)|hi&gI{qh8g}ZEB3tT_T``)0@hCA6CvNUN+r-L$Wjdi%vM9U3sMU;S*bO
z-8SR8m3lZhGh!LLz_pzf{<}ye`LbKRe#mM@V`_~U<Or%(A{oZ3DPM@zJI%39nkZh}
zOX8^j>l;4}>wG3Dj&6``ppy*(3)jzthsxDk9w%qwALyL3ZoaQp=f<Xe&eOB(Rgm(A
zyYh7gtm-$(C%-aTGE_1WLuECbS20Bmv(qi_6-YX7j)xvS7Czxpy{J>5eA)CBGuq$G
z`X=KW(|QeuLRq=WbxbkL^OjZv2%V7I{F!t8*&lG#@7>ub|6KLgaIuL3D3qPhC7$yq
zc&T6NF255J*zAn;-lnq4JVXm2jqDdEg~h@HqxBW!xz{bP`1{b4@?tcv?{!vTFZda<
zZhuK*Wv=p_kYSciSMsedLzIW|pB~7<<l|m3Gt2a)-n6Cbkd~W#B^IX>ch)A;ZJX`9
zqMYPV+eZls+ZE57vC)!;hbm7Wa<=*uZG0?tOjeS%^>fY1z>3s0iI{G+B)``b+;+1k
zR3`E4sGq6ygysFZHR3uF+sX~<5`Qq9yH~w~G&DK<5Do^EFvsA2jF3KuwUTl0^_6+L
zWUkfsvH8YEX=N$BO&7nWXREARP=%$PxzSnrf)nQEjH`)>x5~}Ud1QWk<~MDf5$D1l
zEah{E`N~=IV)F<aH-F9dR{`S%1^CA&axHC-;Ldz~A>+%9W|Ob>Z!+;tuh_V|k0G;b
znf8W$DEY+#`5^g_F(nP(&6x8>&%0gsUeXIo9-_@UK;dCDay{hA2}}O4;8B&y6p4k9
z7u=?;C~L-qFCq9=T8AH6B(m;j*KTYkJKe0H>Ii~zYIeUb_h?^H!GV7>aPK>i&v4Wm
zrYP@xQ#j-iCaI627EDi_CJ**#IXmKX#q&A|-?BVw5BCK{8OmaRCBc&)?4QVu&?T=B
zLQ^sx$+wAx=>*mdVb99Y$F2{*BY);-T6(7D5bjMrdtM%3*$Ygn6Y*SW`}bXK48H&B
z)u;Vgf#L9^o{YDXbFHNX=xnUbi%{2%oK<fghe%X@Y$)zX{=1^tw7&t3yhtOdd$ItT
zE<8r?g9k);0y#r2yuonE%Xu)7U*MOysULQWBck=8r}9C1D)R|#vD{f(^UY6Lh5>G-
z!1(rG)2>SkA$qdsq=%ids{sEY8($Rzkc+`CwJ~1Q)+9km`f?V!-86{sq|<n4WwP4w
zamu$!<l;q?dsyVT!zW*Mw(Nht?@25B57beoK$?8Zp}Hp(C+Lo@CeG$qnNu|K`&e4l
z<>6IgATk9~>s)t+OX`|}n(|+l6wMqo#)@fnHrP$X#VpXSN$<?!c?0PWSSZ^}JFJ7V
zTF)@@B(%~van^W$Izmw45My_2eJgpDL_8z2^NN+uSon8`jiDpiai@I_kkV(o@p-Br
zfF&pN@>E9Y9>0VMtF4A|tjB!GDv|GAI=PYcsDqeuP#i4R#vdfp@5r7Hk^MutO5aXo
zN3e5gZ-7Yzh2}%}$5bv;|3L%GK}pIXB7ANKPSJpn&yEW$Aw~KZ1W#Bgk@Wueg@J^I
zV(626xid>}M)C!DBwv`|&Om<MH((5e?PV$bt8f26suga+fE&&Gce)t)04r~l4+3Af
zJ>DzGL*;x<yYS~Sg5S-Ha31>C3xH8X^Y|G&=u?rv4CL1d!vX#F4+wYnE2=s|8ju)%
zR|a_l-ZrrG`D(XLAP?mU5A}2MlJyQ%W;dEW_yGTpcUi&~X^r1R_!LRai5bX=pn|^;
z9?s+X>oLe_fmV_-_AQ16IYz9<;d5--yUCD;`oE?Azomvv^MAYg{{qPW4+98=E}+mL
zAkL6GPWN^!$o&^jp$MgGN`$+A$qud!MIi=Kcv!w&jO8k&4FH<))A+u_`4L{)gq@&^
zL`Ni?f~)P}b>3v6zb=X05WO3Fkz`mEA-D-7Gz<UT&4!=d`31lKU|oWLlQ%2F_}7n-
zBlsI#P`{$QZ5g~HHS*VAo`rvJ*7l75Uo;r2Br%&paKW)l=G~Ipt15ja{><}tRfGQr
zVn_8k@`F7%{(tcYta{$SpXme4+7u9A@E|*H>V9--cfQZ6MeMb;ntFe}@to#F**G@3
z*KHE_LPNA-V_pfwbcQ%XTH~en>fIY~x+`*4)PgtuP)Q&?Pey+pFrKt3JvTp6qs-l}
zIIRN3!*-;`FA~=!kcD}BR<T{A>&n1oi`D%eAZ8mGga%Us93UG;yBfp_AjzP9%(T-+
z*yF8EKeuf>Xq|~ud>hq%&^zqVOA)7xf!p=W&Y)h628xX$))4g@G9*KR@O3&toS8GV
zrM3XE)mSj*=}X*vNIec53yadJd?md9he=PsmF1FB((4eg%D>$D)y29+XUlHMK01(|
zg$6PW&VHyOMW-XNlOy8;*$EG00KH^q=>g`#2G|K!lNk=*5#1ot=c}A*lQB8JBAag0
z65Hq1(Wj_0pPTpzx>>pa<mc)`XKL{r2O!xI+s_4{r~E1)Z^!5nfPIHs`mCeYu2%L>
zQupf*U3scTNarNhr9=>CAeCwg0(zyfx8EPrJ>mbG<M23KDnC?iQS$rCTT}55shUbS
zQW*0QVKsN1;??ibed(#Mqny^i<R@iPe<v)epkCO>Jl8gKt(&Rb_7sJkY@1WQq^KR)
zAQ^)G(Ll!i^I+xnmOFgRdf78)pk*jM>B5&~C^Q@{mjsVmy}UEK`9L1+Q330Sa#+(&
z0QK1i?T6!l;+q?|H`LQ(5ogP|nL;M}n}DZJbCoVXEAKs{^J0g#9>~GHxyfD_DyP9S
zUR@-gJwcSn;t9>VjzY6MjrOWjP#y7=1hI(sN`T?X=c`=|CzZax2$YBeT1$aY(5%<?
zx>28HQV_ZBMT?JC)WD$d08TZo`|!?@@$RP3cO8us1X$B#|Jyf<6CGUm>G&_?v`X=S
z=EVbqy$^tt0Zxj|yZjfaAc|xPT~xY0_8?bb=qT)2JkU0^&`0vy$IP_9p5wa$O(cfT
zAf0a32Wix#Id0SuAaz<Z*BioOZ}GY#hT+gDwX1u+d&J*qN!fola_%u(0wxX`WSDIM
zn;B%Qmv))Bj;6)qCm1i_20(is|2+yr|1~1!H_DEYE{)eztu8-S@F`eTmKM%`Q+m#Q
zA?W?whxf<rZTf89C|jYj_hAlo$W3OfuxX$GkT==;T+i-R!FNAUxbA$rPxsdyYnkU|
z=f6AB{LdxPq5KH?dfDYm=$wNL`VQXH#b1N-?5HCZ;zNm}=*@tb;S5^QrsCp?CnxLk
zcEeCZkrXsU`>bkVWHq3{l+#X1!JHrAw*E^~Qs%kGHFGZ}_!N*2`D@`RHztj?=LbIy
z89zK&R(N({-*Es-Yta|~Y8JHx(D|T<Jo@wF-FPGQmhv>Tjic@Y4mwhK=w=|#Q~+0Y
zuBNtuz(o+nH{}%MJ_4!bhU4jzN}f!2g<*)|C=MT}WwRUTq;$#qfpa5h)pokixzBbQ
z_%$4Xg018v^E~H4yw)AS{4WjphH{y9biU$ENyGfO(Mi{-pFSK-OA5}Hr4l}UM-+6d
zH1bSdA^CGE+kV~ouP;BSA)e;&N{enP?Z)3G`Qehu>^Go?hXx`c5&(Ilom**0=FgnJ
zzvgJ)y1tiMtR)zfQ9YQSfBl_&=My)|Af&0T?6=<;qebu8nUw>c)`?IM6>W>xO~!v?
z%=Z8wAT|e3ZLUV>kefjBC<JC!9|*3^Z2q$MTGuOdQtKP^+P-fe4V%9Gwd?m+P1A+E
z*Br`glm&a<@!w1>nFp>xSF{$YQUzo;gz%2z>h=ePI_kHr0SqQTS#p1Kv1<vqHrm3|
zpuK%_zFFa!o`do;knil#(zB%-Z$WpdO}-ryS^4a>b!%Q9Z}SQmpdUJQEtvZpY7dq#
z8aVe_rCKXAO(v|uH_;xAnanP=`)2i&fm_Ge=>DQ+<R&P8&UBpYJKQ06UkuQiO$1Bx
z2g6Z<880SMfliZyva$d~rcLQIKC@pN^VagvvW-%^2*#ip7;VM#{A)2Ao3maaj%b{6
z+UEcx#I~RWOQ1WJ7E)MQQSdxSA_vI53l)w-EibSqu4@{4Z93_J;m!D6K~M{6`UKeF
zTY!A_81<T5!}R2vm2lz~e&xmmm)25=CW|)%O5GT!0UuU2B)2!-+^d{Zig3R!G2_0q
zitw^}qU|emUAF+Nt2tQdq~Zctys|Z1<<{EZCI$f8=^IQI$Meg8$g>3Mc-j{RJlCgf
z%NIM9CbE8wd5U;#dFTPh@t%=p`R3&NZHtHd4%$R&YF+Eo7RZC<xAG3X9=ZFib)+h%
z3?UK>@>RG!$RcZmV7TJt9U`&T5Url094+XKph15}p&sNsz#gzb2o<qFq8;3;QaL7#
z!yeSLE$nD3F+H9?MwBN{LUMu%E7D`=K5YhAe#O<?&2SZoCt^OOB`-p^Hh!tN|Ey!M
zt?}JISo?J%fF&^w-s0fcPxevR2||A8>O``S6es7^5PHh-i>6mc6A(>5K=tC5S^#}t
z04v%LzgmDG+u|rkEqSlcR%jBaFAQa;xr7E?l9pAw_<9w*UL;d`&@POLwC}}K7T68+
zB3jPb=#DM1j6fM|^+PgkmP}9p7jzh|YTEW(8&93~CS!SwdU2`wa;-DO!mE8}``E2x
z>6V@{3Fpt#xVd=N>_2x!WEPQ;Pf`fU?q7Pam+E|wjgsa<>91QM;#$YdweE!>;V!|S
zWkUR;ir}I#gw@P`dv#`|gjQvt0oru&>LA+u7&s-7L+V%W)id8qE{_|3GY3L(S~IEZ
zaoD3;-5Lh_z3?g`<EJNqNY!RA4|;_vG!;cBfcN9vZQ}1rc#IMG>7W*u77Y8nX!&i|
zj~+hjNx*q#*v{5cOWLT2{k#%sn^EfN7%Q>*E!^<&>ui48-u4nF;x}(U{lNlEnp0>i
z;h>F#4pBwQZP*0?r{jyX7V7g59=I)lMS0s^2_Ga*;fojQSRl(~IdzbcKW%~QI^eUB
zl&?XJRkI<H?8#P%XSK$d^b3{<EEmko-5wfHV-{Q1;Otzzx6%|hncxYwV`Z(hBVi$-
zi1mU1DM@wffpN~%J<@!a8aWMEHLXt~)-Ds`@!Gi&R%%A<m#^Wsl4K71ouN?PNGu`<
z+~n*S+b2HOadiXoD$Vg(OisDr4;oHuW9f!45!2S8NXCKZjXyM*)1RR!)@qUpA^MN?
zGZ;8N6Ow7>D^W?IT+0U994DJ@fqlf{akH->rmf;5_&&Q|@<Aj@C*i354EN)PU%I52
zV3#XFK}%<8%9zjYtA3P@8<M76=KBk+*X-F|`G!~TWnUQwvKF=o>tm<Vpss32^_$^r
zMG0!+9qt$f``Iyu4`~F62<J@TY^@#wiV+4^_<Z?T!taPXL)-y^@GV$>yXpth)bfks
z%HkRZjr^u>A!?Pio~llKAVcsOA0;a<&lY+u5cy0V?W>?OMr7--#>cFnaXSc<1Vxis
z=GO@#<CTkA_vh{ngL%%&!4fKZaj{PHBO{NWa*?x`CU)_cngibw&MO!(ZXV%|(o$U4
z7W$;j;m2)Xxrt|BC$-hQS1z6{b;W$`)nG{%ziyN~cRzJU5P4gH6os-+N(osWLXNOP
zMH($&H|rRT^vz1LZ6xDXDRlYkBE1HzGuV_fzcVO1C1dLN(1fcR=gjorW<ScNyU?Q^
zk6rgS;$KrF3cw~47F5?>nX87XxlnKfrL{B!7{48PbiAxezFKzfbrOvo1hnjal*u5H
z_t7QECk365thh4-ySP92@|VXE_3GsUpPVl{wm;k&z8W$|R1jBVq<lXmaoum`$%y*n
z_19j(bL9qc%6@6ptxu}g#ynl%GI1YjI&Hc?+Il<Npw!K%u?YyOHUNy*oMDba-&i)j
zZ%9irURMO_ux*Yi6hpUo%jRmX63Wb+yiN0t0OQ$m>i5MjykBggMRstm|2p<Qd+HK_
zo<x0r44}0KWto&CLqF&bk?#&+Xvn-LWavALetq*#JxGXdb}KC{QDc1N%iZX+PdvV}
zJ+{$sU)x?C@np*2fEGu=D0;3AM@=^!A;+<nCf-)mt#Ow2iF;<7%~K4dR1n6&{Yws$
zM-s?FVx7TM-pcI3o<>&SR{Sa>La`yyzb|T^5>bHBg0T*pM|jZnRzb!?4)Cg8VxP_j
zvOFFF0*@hUc$RM}XPZDW=2{mjm%UqEijL__g8zsc`|FUy0=ZX(!p`Wtkm2(<QP^BZ
zt_vy$&H#Z&oBa|vs0H-Ma)!tnx#gqEIdn0^s(k}cBwbTDO?PL+ofHQNnY0;Qv;gM2
zfmeD!=TWtlBfRKTc{WY;NC>)+D<9Wd{c5V=#IRPuP*KE{jCufXDg&#a<&yHbtcJla
zearESnk=Fy(90GgrtT-5q)q!8%UP1MzI<#zW0RvP%&(Q=m!naa<Yr=F(8ep_`P+OA
zi${IA-%{lN<^-9s#v5%cP#OclH{+v<-ah4|{p6-R@j}G~4yo9jmQ$g>)A%M`R#08X
zpNkg!7&jytsoWkaN4Op9)TxX0T9d=rSx)<!Qrm3W(&mnhSU60?O?09@zzpE0K%w2-
zK@V}JcHs=Js9r^r;1g71L7qWW*A~A#q7Uo(GEtpaR1%KUefG)e=wfvl%gf2?w?+l{
zr}ErZx927T<jJk>M+`sShYKgXz564%l*g!|KRTbInChYDP#Zta)AF3CZPFnn&sO63
zHNy(}%HRo^hA@wk+~J3Jf-VvS3=M|TR0JrlB^}vF<ftu>S1qQ!XxjSLvGS(^)A(I%
z>m)D&Zn%|mUWoyDTt!I@y+Wgm!tk1t5grR~my|BdxHryBi+E+MC;QM<QYZOxtB5Z~
zhy(XT)26^v#pQ~z!``<ahWMuYii$LD2&eSAKb4oYa>E-BXK);>)yOfs>-Di7oH18b
zza9NuAm~fU`U!m0vk%Y|N*4P5DS3_D^mM$HoT1AIX(xBn)u3IahA*5nXH2gN*uG?W
zifPI+J!1{NZ#oO345VD>c9D*&kB8s#JH3<_B~b{DfeQRumj#5oR#~wi^0^OS@G{P0
zc_PJEfj?PKOswg`n}M^|FXN4`%6nd0&gIvMn;VO8)Zn-e`U=kDDKE5cq$J;Bc4}9{
zju)V6sPS|QmdwwG+x2?fR@UX5PMYcpo3<9XP+Z+;%QsUZT%B`7^jq@aObs0;#kQ(Z
zd;z(Nf;KxSLu$G(P9y2^_pGeN=z^vwf_ovYixUz@7ms-+yVpS``)%`T(tJ&lNx<zP
zC1-o>^9c-1ZGNfaAea{gvoZeBJ1$M{bCq2pH1iKhG-k7{TA`lRSo^s}v8X47Dki|a
zD#NErwtKHCN=bQrW4COLJ{d)24D571Z?qaIiS>=D?IdXp7E48%o#xCQ;!$o`N40f<
z@<8lC%HXM%2w`mlC(Smi+QdiPQ6=Y!wG^E&j;ANdl$BEiPHZIlzjj+W#tUHv%CDa+
zr`AV;2A#X2$I}PV)Nx8!w}y$FukjoO3&ui5N#`%VWl4IpAAL@mO@UuLrh;EYF3BM=
z0<5t*X}MR;Ad}2Ha^rGBuR*PS^<;G^%hm+WiIlUV{NqBr9`*b$-NX`5cV)`=RbWF2
zEo$d}cARY?r}8uFxK9svE#lTP_}Hm38Lhz1be7|)`q6Qfw}ntVx>m~+Ti!qGSLvzd
zl16huXLl(>)^3_Pom6tconB1A%JoAEL-4#(3j?05wr#Gqf#Y;K;}T4ZEHU^~uKh5=
z2zkl0+u!jnrYu|P#W;hHi;EZQ<ck;{eD|pt<-{%I{F4c|87CD@@(+;il3u;<&_8C(
zZ!iWrnxOf|mz+%LosEwV=UZlQ-~tabJpl@6+EbQfEB-+2$$2G*pWlt;F74Mf|NPp(
ztejNYg3g2e_^4aUW5yu*<v`D_PFnpsUbR|Z{RuixhYWuldZDx=QKIW=8k})0J1j?a
zflHYTZzZiYttyH^=6R$n{bDRj23hI~;pXz%V6f=S%O!8u5)7^dFd|7^w3;Fz0TbDa
zCN-nrZYVeqf}%ZbWpFI0s!b_?vr#^6Y#se#{Ac}%9I+UmvEiYvx{}#5r9~J<d1boS
z*--!UJ4MMyt;$vV&y2WF<tdcNx=D5|1V=6L=@jrj&5nh(%M}s%blWQp>ek{i$czSZ
zwKNl=%~NSc<cl8?>BDYS&oa)?+%URDyj9Eii7cn+GO!Vmeq6*GDbUwrDYMG65O-M$
zG;C|Fou3miSCgT~d1G5CJzE=|441H~^)EVmIJEo92U$*ho{KsQ8*t=2M^nR72ZKVx
zS`gvps=Pnq$Iwe9Iz^Lm%@sM%q2t#^v%%YsXN;*@Knq?X-)uFeB=OK8iX<QE$%pGm
zcDi0_3Jn*PdudO-SULdZvKqx5wWLJw^7{auqGLSq6@(k2uHH(5UK;cwrfRWtZWD5{
zm6o>`=VU!e`4~r15|_r%2UP^1!Y;z`7bZwddeX2_GoQPxo}{x}Zo2F|kcp_jr*4)(
z@e(Zj{pjo<BliR*$l+r$xU*%ajL!CfFa)MG(a!=moI(?~etqJY6Ubwxsqg5*0R2Xm
zc&ncvrth3au$oXLb}PMZUt|^Rs;n)&+kTCYAqDKmw@{nj+_{?3US7q9!z@GzMNo|0
z!^%-FY%Wq4<a9c=mTgvtfux_IhMaxgBv*K1A@w2zN@)sAfI;G;{f!eTWH?6MdFU}}
zZ)t32=KvZ1q?rbse~~Is*0Kc-6c;nJ@6dx|2@aCqKk#Kxo>ZC#$l6P64AmpdNQK&-
zOs7i=!}>-etb&?G^lYa4jqWj)XRA$lwif2B2W;`Gs#Q6(NRzp09I<{^YE(n8#n3*N
z4_uMML|ATy8gq2s=j$oI4=kc*J~<8zU;M9BtND9I^T!&&ho17{r<39kJ2-fvR&??g
zXe0HxcLiZ7P&%3!exWHS_eXBKOls9y;x#kZt&LS@AcRKICnm8DCstiM7yC`w0HI{Q
z?e^FK)UXbRN?}h1T81efr++;u3F~F}Aqh%NtX$e99`v@<B<sp(-1bK5vIDOUtn7x|
z^Gi4dr#=Sj-b_cQr;KMwLXyaSFs8(lHEl&GjyE6KaD~mgL>a=_vJ4UnCKtI-PmtSH
z%v7z?^3&lUG0I?@%<M`C^RvoaDd%rZ7k1rvYjr{9IdrC7PX<)lh0QRO*sq)s!*zhq
zwlZ6$V`cjX9nWr>*Nn3%FAOh|9`kJN<zj3-!VL$~Z+05}`CZ%=_rMQMrhHV<eduwO
zAA0gC>%~QA=FRrDuBnU%5VHi?wBApFnpIdu;l+%{tF4&OGa46mgpYuAeTb<16*gY3
zGGbi%E@)#hTM2)<GQ3T?QO7ROp#aiN@xS+#4Qc*<-Gx0jk`5F&KY=&)NDj~tfpPaH
z+8b|xHS<&4;vW8$_~n?p&3Oj*1`3;Q!(!D0s$1wMoE9yx?dx1ynvM>UHFJELX5vGY
z!BNG(`7A4}JdZ^p4||VLYo+`NyU4WxdopqH0&N(NPHueDh=5ZlR<Lcs-j7|)fLvGI
zpWCx?%&oQE1=_53W|nFOaU3?S5oPI}!|d`6iEHC)QI{-&wKaoK8hb{J>{TRIlf{v|
z`7<rcm_4x<zk)+aCO9k<H#N{Ji()ZKavCXCk^{#droh33<Q*f!(ehMX9k75kN}tIQ
zaR$8-N2}2UD8TO#(q3N=kk8*(AKG5Zj)ju)a$7dzg`{67hBi47RTiOBrj2Z8{Si$g
z8mahxk_F2A!C{S7nMAa2n9l<xVx$uQ2T*S?;!L$4r@_T|+)kjgBL|?t)w4YXCQ;TJ
z-u^6{@~~nWv%W)TLwffDtMd&~Bi*^nbEEu|U>~YSJUsM_SeP7vWSnP7KY+!-dLsGe
z3s}s-vC5f}$qW*U>Jq!$>V1Q#jOrZtfsgKUS^mjxFQ*WY6==p*fo<h@GPfVo=8=v%
zXxkcXue~Umd2DoXeD8n~(l~>B`uVcvK*XIWs)?*O&1kE4ubTC;LXk#D5mMmi<Eov)
zeTbeC>yZ|D0$@1)9bc5hb(0UkU^ltHwvZmPk4tk-HEZ3lLiRbB9un4;XJ>J*4%<EC
zrj4=c{b>`h<%I}z%qqnRlUAe$$cO}qaJHj-UUdRN@89isfynFVn=RJ%nDn*U0G|h3
z#2^^;1_;E>>tF3*W^^~Dr(j1^ze?KMMwVW!9ArPZlyR=mkLY=(z;cYkv~`R$i1DWq
zQ~_K@3r~9K+`g_s@AgvA<2j{#xh%L|YcTmMlNYYwG&pP-1g^KVi^I$!Q{AcDawk{d
z)7KQDQ(ZA}f=oS$oVIB-$(;X`JmRf1C`mUwwgw->Jb#0+BL$kdFxTmBJQv5*=<wy;
zlcff>5R_ds1XYP@&`25==kJo4z5Aotz{)XVs{$o;$Z)t!2hxiSibv-tG!}J%2me+G
zU0a#z+oho%n*7YeQ!O{qs<wu*7c(KN?0u9M&iCjg(^d<Olc>2-rR@~{%KJUnk3XME
zL=CTlp_pq|u;Es_6D{w^FjrfhK|D0~DM&t;>^4Q%&v{?C_1@*K-6Zw#O?5fxDmtQV
z6X}=}8O3edyX(g)<gG9B97T9!0Ih##sN7trn>G?#&mt3@$fgNfGx4g0K<v~pQD+<Z
zgKLlnOarMmCqD$cx!<1{xoZV1tgSP0p0cXjFrB%sDmj*rf5L7|b$g{=GRxxg>p_(9
zcoVdu%-80T<k~7V))`$`YBjO|o$b$hUA25}4LD;t4S`iTN<)mVGK}O1nrumN3b$K{
zz8=l@WpD~<J0W4{Vyeb_QYlw<>sQ?N%bE|!IZ~87qsLzk3(1*?Y4LgthN{upHy*mc
zovq!eVNwIS7U<O>3KLqi)wcjesLsmjW;QEhp}irBZ8c!bJMw#3)kNDXj9ag!Ku5f5
zL3c?>knkvFeb}1w!T^DRS6}CC`pgmND5eUX(Nl(0Dr$zP8KE4&to}NjZtVBIL6HBz
z-J2b}kS@@1g;j`XTl%BK`jp1#h>D$pnf>p4TZQ#Ss2r}fRzrE<aLl0Gq?M)htKpjq
zjWYDpIS;N#KQY=M#U?411F>JwV`Z-CQ!VE7ie$rS)7IIFYX7XL=ilMm1fb8@i+Z)X
z1W{<mPIP1zwH;qS#(e%mP?GQd5urH^+?i(?6pTUfOKWn0(f$u}<|-yGEyf!02Q&&n
zANQB?<dwA)nv>eJ46%~IknD1%LZh@N`a6f>{v7)VZd0}5Iq)~s^t39_h!8j+AS2Bd
z?nC{-9qug=g<M+LeQC@=gKDi0*AKDDzr<euRr$i2IXs>K${w>gv*bf_o|Z4|CTef?
zFGK#_U;`5i!|p@e=`)MH^4=d4<pRqvKXwLtnQBd-2U8}1^TEiCd$uND3uiB7a0K|4
z&-@*S-NkBBkg9Ca-)a3mb&Iy$-dyI3d*2Gy6Xc-k#DJBYLz?I1Bl5p*BYo042X(*A
zw<+=UfaY7~8O6onX?Ut-Y|l@n694e(aLaFggUM79p^|e|d#Af^g&3ssj<6<_6s;G=
zzZtyVC2LY?nP*m>!kjWTlE6|t5p=H-wLE-br|?sJNn?2HRJi}VbrQ=_vLSO7h5+A(
z7lNL*F1*_kabY9^l&pkmN5j!Y-3momSS}<kQ@o3l&**mQ<X2+c{K|kkm#oTa&?@sS
zHRg_D<fNc3Bo%MQ+BH-Jln*haj7cP{`x%7=F=~tZ@GW!1m%-JL#3b?6mX^c_q*5Z+
z$3+PIkGI&%K#0oy?8yAPBP!j{iP9h4ihuB)kMUS17+FUwa=gEvbM#~&<5B%e`E6Xw
zwXUl!{0TFu`3DBMHWqCX9b<fIz()joM<eKE))S}f8mBX+Pukr7QAm9K;JZ>Mp9<%i
zEFLnC23FNh;;rM3&{{qJY4XNDi#&fQzd;^DYl-tgozZNVgCS)P(u5oJE-{?zftCy*
z%%4HOiwda^M{C!HiF_(QPpprBU!czUkVxha8Y4mc;z9I;&h{kF)~obEzE3VLYW2!n
z6iY{(KPfiQCD><k>xH44Rq`b(bTP=bYW8RmYSJsi>B@^!7<7E7(VK<mefriXzpf7H
zxlb3C4@<F8&Z{LBkBLD9GO34g;h-MjtkD_kSOIk@uQ#i5Z2bJAnk9ZUwI<O^D)jl5
z(=51=uP?4+pKG}+fi9xfO!QPmyn`~E{zj-W^`w9S>SRE3!k&&3%Bc_3CLgR2VKN*9
zn3~5*+S4THTrsABGsn^up`TGlp<A)Jq&Its`9P!wDQ=`NuE6s}=TAj#$0<-D$Wxq)
zwLv#H)^ofA@HRz4(}mwb_!iOjLAisZU3~S`u2vt4d-y+@5S2%!=E_l1?;<D2BQm8b
zWa!0>+aC&UR$kIgw>w=A$<t`L_zVNCHsKj7OfobTopICg0@OY~GVN%uuOy_gT%NSO
zgNqH#8Y_0Yi%vps<bMqob;AT%djCMlO{=>5q=1@VC+PPims*y44_{@mbp*W_cLHuS
z>5A+56NB<+-KrG{6`I<w?=R_=LeZ>^ZqdIHMbD|fAf?B~N$av&7k3uAu#BJL0Ui8c
zlEF`s(J9`)yD<RUNWmM1tJEvNU(Y{WC%0~Pc*kfA8(+h>Y8ZFQI@Zb4kDEeoBk}Y5
z2Sj^q3OlrvCK4u>W=dzX=V|ngp9Pv&==id!mx+t*Jpd`VciB3Pg0@cq$aQHN@}Ylx
zh3TxW)w_qm2@g0|S`1H`m4veQeO7XC49dn0lXLo=X_T1lm1m254pMw=R!&1DFi0n8
zrM4`W-}K|Q9s{j5TgFB@?jJFCgBi@plw7VPWY-7smNYE%WM}(6oa8C$Z*xDZMgdg<
zt(&*VZX=xL64B<NMX(Lh;n-9^a!DrRI2-(r9I}gyYn9dej=xL$zAL4($9s$zy3UU8
zC$I-IVeZUKYdIF@*;-q~7JRuz4~1rvMlBd?5bB_bOQ|UzFlOO~(Pv`@b<pzUpsrJ&
z+_=Pb7S0!d7||e_gTfi)ku*ks<dn?^1i?tBX?bC|0+eU4{+-oz-IePzqVAw#rksTF
zp|%px4EIdEir<5$$~wg7_TVV@s)wv-(?eHfdt??C?Q^hl%o|U?JZM)stu^0{QbflX
z*5A-lS5~3(q{)0H?shcRx+xAsJY6BOzwQwK*tK^ZtO#-Wii?w7%kn4AaH;7R7@dWy
zq1tsiM`&%aD7^E+vlPiQ=mnLrZTn+|&AqZofi-*-<I>Im4Jlh2`;{VMg=&O*^GDF2
zYHQjdQnCuy48K{r2FvNgmFzLywnYP3pahA1He8+fxsX`+KIy}7l*YQ%&|JG}2^o8V
z`f$Q6%OMZCLIZ-H-v{$(+wY`#=>a|`NEJpB$0?H9bCB1GG@8_shQNZEphz_kILfG>
zqrm3tD7gU+$XrEwFv=)9k1`+Oc+XGw4N-=Q!x^`V7?+T}kU43};b$06^}BJXD&>`C
zIimWhG*jp$JDj!$+x~9t97;_c-p)HkVnmUYge?6v0|+jHn9i)omNwkA#34R9OOn)Q
z^VEs{j&5T)bzvHt`qX9w<qKH9@J+w;TDta!o}Yf6fa!@C($h(@@saGJm8P(u&1HYL
zpF#Xv*X*HmpA)u5;Hfq76W}xFV#BT_>GJy+P)l~mFII!H7*}+L|Dpo3Q-oCz#Wv29
zR56O4{W7CPwGZiFZO8DKzbV1j?_e$UFDHq`aGw~5^l`{V?bMaS!Pqw0UnbE;2}(@)
zo8xXOORmonh->1!w%#f}NXb=W3oVE*mNd}H2$h3J@~&39=2?misAqp=d>#LVojiLq
z!b+4>LPR^`nP~|F`?u>_v}522C1{^iPAh1I0+LI+tLVL%DbkGeCeXbrUzaVmw8p2s
zk)WNg(Eb=Iw+~HFPXdiET@DvQ+B@UcsGO)%ZE=a8wqLDjn!F_z7&yl-qTXd$jACdu
zTkmNy1KflDLidg)9>`{sF4Tu_>9n8FaXT^rDp^iC53w)aJdoroq`x}=sbbUH^nx9E
zxhfNH4}WgAw2Y#swUCxk>b8@*tzgxd8fBtT2Fb1LZjV!Q*$i46$o-_Mi^eza!wiD3
z?+jN)$VAGT91kN;yqi_)%!FdPFL%Z*7qxjuTg}JGL*e`)vXnZQ65h%&qDuMOX=5js
zKtff=Y%K!?i~V^OnzQbN7LJDbz@?6YTO%AD*3jPB<exA~+771+ifW56=y`q}5bLsR
z`A+5MqT4w2$^3YuC~{zK+X<pO1yJ*MmCLe;^N$*wm7@}QA#pc7fTao}k~ZvkyV{m+
z&Zc7Z`>PmyuWfjcqMA{9c6xw(2S@)wdV}aj`8w3Sd6|Lb0OtI(AeeQh2_^FB@GJ6)
zDXMm1KVEv9%E^SPO0=L!#~%3DNBXSVn_Tx|)lH4vyon}_Gu|kGw!l&+cgv;yI43Ph
zlX8`pD9?n?)6q`P-kn+b@FS@TS^D%d$LS88G5uL5F#WQIOquS82UqX*Tb5j_fe<KU
zUeagWC~Urx&k`ei7hAM+CEvd&z0;`WHvg$%7$VPis{CXqzTxcQCjpAp#nT0Z8_#&-
z)d2-$LaGL>WTQwcFQ0>Ixvj;wZ}mZHTs|JnKl%dlL5-<q(b=qi6!jz-71P+}Jr{95
zM_BI>=y6U5Ws}qMu;)05`lNu9M~4FD%^26^&DaW~fdw-)-sjiB4_JeZh{b*Cf;)`o
z!X#QcuO7xl-~M=^9<?u0O_sj0nc^fk2#gtsrUytQF_tGjy8su%(>8rR9YR5Q)N5XS
zpOc2k4^w(9A>0m*aaE2nb}S;jTkX=+Eqba!YAWGxH1?+2f@g-7%Yht4n`Z4BX4tS0
zC_m5flrm(DU`h8DGf_P}R-7BVCZ+Lkydr}pvmJ0X>5JrxyD1W@Iq_CZ?V7BT{jt$s
z%84)4R0l57wa6a*G+nVg=<cpT{zuL%zL?mJJk{SR>XAIPzCh$t>{uImznXkDN?Who
zFS}^ei7H62RlB|JyMUqvp!x=11Sg1Cxg`U1<@ETeo;uA;H@pqYM~wA3@ypMj)v$|a
zb%-3_YBxz)pm)XNdx}l(@cKP0ael#TFrgfsg1Lg(Fgr~C=<^FqtP{=?0nViNEexZ(
z$~vE<+g-VK*0%Gf)4+<6+i>>htX^FDM!tARaOUSqiF@d9x#vABT;WUY?RdYk*$z~6
zhG>pqe>?fr?UL3?u5a_}=;}fVSrA~-RV=hf`z<_u;qXyO&$1$#*g<oRmPENph_rzZ
zw$67Rf7xeWSuRw0{vQR%$I%byzjqdhdG^2sQl1{A&v$&-arqAx;I`wFTd@X}&6ZxP
z0Ns!9I`Ib>8gS}7=nJ|-kyq}jB{0NE>V4->OCN$-h*{Q}06n#)6y1V9xJ{<Re*le<
zJkZzJEEf>c=lg&8)$t=A(Q$IJpYOhUh-Z%+Jxy-P?(qGUMK2&?Lhld}4p7(s!*7&Q
zBZXpC7}O;7M}-S57vN;TXer`L<Cn3+^&&5B4r!btnFlHKFcszCU7_^{Ux}ka(9(h5
z8^uQ>G_s%x7aZ8K0Z-5G0zK7)0hp8GLGAd3(i=;%-<*NVr{dFV_&-G$L0j$0WP5rz
z8%|~5<K>cfdCkp6mU|+=he4_5-i3n*y+YAQDv*d*H~;i_7l?}KuN{nkq)F0l0SbO8
z%jIX4qaqe>`Z|`25(9#Co}L@|G@SEv?+kw#2^s-Y+N*svHCj9N=k;jOw?y5+WaT~T
zNY|(z^jx<+4roOy3RI=O*XAO=$2?kCfkENL<<6eBlwcnRINdKBXZGyiarF^(%07Tw
zPJ`LN8+erd3cy3IgW6(%-u~tt9aB|$;>oNx!3-7rmMv8v_iH9PT@c;@V%gk~((R51
z8sv;y;-v)v8kOwdqQusve^;5;KV-UM=jGjU_g1}4k`MHw#>2q_J3o*}iPVOMxfj7a
zaTYY<w=(*5?X|`+15RCwTgSHS#if$oX&|!~LoW<J<uCF2H`M8i+ms%V4lmVE^69|1
zIm@;W-$JV<HBu(uunBiH*Ct{6HU8%ZNZf28m=pj+xj9MzqxCN8&W`>#EV(Qq-iaNr
z+JbIiHY%a86aArC_y-dP<nFCd_{W8hla!by_22Q1qR1nj!n<m8B&9>5MUzq$wdR7c
zI-Ax;K7y$P<i}*W?bH{(rO}QL<9bP}K9XB2w<cQ>9x~s4cWC^cF(rv2YB(uGV!rcM
z54gM{5Jaw4bcr6WE|5Wc-zr+Po-f+2B;6KT;D*C8m%q;}O793)b&)JAcfsXWto}x*
z+j#FJ=x^Bqm^S;F%XAR(Y*XLr<dR~6q;q@g@E)XR*^u*q<;smH{zgb=)Le}ZDhvTS
zxQxsIBrnZ8RT^3q(Ncu!gU2qui$0tU3WS{?Y#Wx%7b(jc`5=1~KuGQG><RuyBHHL$
z#-2l}*#knS%F>mU_!nE-Z(#n`0|kI#No1<|Zc_G5dKmDiarXi2JlF+yg@s!y&WeD=
zH=d2|=FH3r^S|`&7tPKq|LW?biXpiJ$|8-QLdtiFs>8%aUYmvD(ApSu6yZbwZ-b0l
z#fe_2#a!`Zv9<ox%X^x|eqFaeeT3Azc)w*-Ej+6mHOD?`7N(*JQ9UE^m#V`TLBcm$
z;(p1GT%>R^Q3B(dSf_-M-KX2F3PL3jD5dV5({Rax(op%ytY=&lI-Z-0+6au+1~5AF
z^4wXVdwV0QWh{~u#JH_NGx72A1GlONL|ULB`I6>wY?avy1M}_G&t7F#wZ)Zn+3Qn)
zJi9Ouh@7mPd1$l+GT<#E&ipxjzs~?7jie{M3l>j1pv`~4fU;|SBw#DnM@nZH`hJDb
zT<nRycq7nOo#t_L6%;i$(X+CIN1N03UV(ax>Gm>G8jqndE~&ZHe^-Ui=C+fk8}EVN
z$-*vTN*O*vXjf}KhIR#dd$urXS%Qn&#r`g)J%E@#=w<?|YHSster=j#ggdhhL;=($
zszd%xakyfA=2vI5$k(Tr%KX|ne$ux*ruXAv97${|y=RZy{9gyyt#?~xJN>df#DBBF
zzVIMEKJG*xper<?T$o6vp7r+dtOSqwv_N@(OM+}U>1EJZR%NlIKN5F$lzR>Ut360n
z2fDBYLTldYEX3$cvyGLXc#qXPqQ4tEJRy%-DgOR?>;Z7-)LshjM~DI}#E46+0mLuE
zTMMk#NLva1eLx}wdpulKZK*0JkaMf{`Ew&pgVr#N2XX;eQy-0<2$iL3Tsl9c?L!Qx
z$V)+}HE17X{YD_ZFy;Y?#J~1S1mZKnu|(m7i~tl>r7%`c83md$d!C$oZ3*)1oYiRc
zlwxuu^$@h{zYJKv=LeZCx4;bO#ET3K6@OBGTswTUB?Ln}%)hHmp3K{n$ag~+GgyMz
zEE*crwsXVPqdltV|JZl9f;v|t*EuqTSkR;)MICDhsH;qaQv9+rAK_u7g(O3%CNc&A
zWzI%MA-gG3o=hAbotIg)O4D#vYEsR>u~n`Ugb+oS`|n-?Vf3$T4v=U?$>Xz1XG#Xg
zJ^_O8ed>HWEi%?X;so+X056FCINsUO1%9VA3<CbY1}8Ql6wZ)xqsLzRNL7#&RD1>`
zJO&s5Z;HnFNxjt>TJAf*oC^=wHhg=g;^rNiV#toBVY;{+eBQE!b#~kPH(ciT=div)
zL?B6b8m@}oa7vrER=!eW`tZ0#5EkShndgo7!}=dMb`$Qk{Jm6{t4bWl!<O$lNWtyv
zFdC`d3zD~!%E$4YXw2xZ7N*R1B>c+XA(BpXOaNdQk4%BySzmSf_glf(2Jyd8OkhYu
zq$gE<h@&dT0SuUrwD=DxFsb<nBrA@dLIg?qEY;%gL3!Q>AYpZ0<s@b^_VYL2kw?Sq
zNyt_Qpsh{0e*7HlbFg1};c$MrkD2PuBf;pi{Iy?LbHm^ZGVFZ8Dn(+LM#g@M5hj2w
zwds#IF9{iPTK!>Y)c&N8F$5Y%`J%2M0IGcA!jG_q233J5zA4f<h;^Gt$HUr~-z8sq
z@~R5ZWm7wcldTkxZ8x>tGyY>e;|ID&90lj=cNmLj&zpZd8ZG9U4@%i?&fxp=E^7%T
z)JzN)cbPa?$K!#P;wBx~Ssu@-YE6uoA}GgPZVd)i*glv9DdF7OQ>BOj;GZS_>*FPS
zJ&Suc-1LU=Zya>~+9@=T{}!4fJ>0T)24IXWqH7@HX^g<Sy}@Sn&{f`HKEZMrs`qfD
z#t<JNK67q3q+|ljqMcwS@NGiDkK$Z>D>q}u9RzWD5`oA6_0~d>Z9;@>lLzPKdNDbg
z&7xlIY6=M0*Y5xbTE#EwLRbD$9{^0s{9n<MC|)y$8rvDLd3tcFai9a5f)bceCn3Up
zPIN+pLyi@cfnJB28#w;@5UKcfV4W(K`=^0<7ON_WHyo*V(H2_}!}6PBMf;@5m{GoO
z?og0~QF0rWb<-ArLC{BJv~TamLP4BzY7SU4-L5}ke*`TM9!m!35hHlzgC`{V9pl;x
zO#0*cC%h2t$v@je>LLi+{gxb^#ZAFM`3ztJq1kDD3St4C^N{Ks2jej9AaHT$!}nTe
zMr9xVp%AZb3>Oc+i8Is1gtbn52Qe-G^r#5{weVme1v;*a{j%|Oe?DU;XM;SorVhbd
zT;?5CIf8GMKkgbB)CHN}?L$#UoW!7W9R?u%TX@(_Q7qP75S&mM^5RSk#W1h1ZB3Lj
zl){ls(=L-B8L>Nnmdl^TG2T|#t&4s~i+iwe{nPGf1fA+a4E&jtPtii3A*`4K_)CG*
zJM6FvaE_)TwGH!Z@jdThM69Rolyq^1q95o;(2(J_@682-cPZtF!v_fA+V)YOps(2r
zoQF~66aW1vA>^Z`?x7K}FBf7B?Daf|KLkkK%wdu$3m-*0#}G9LRmU_yoOv5X)H#Ys
zK#hNPKoUVX10X&{53ErliA(l5@=QVa>V*5u^zU6zwd0sA`q~@;TOtfD1L)Nwd;vYc
z6=y-(17{_S7*e>TgUllB&$)GP8k9O0g5L0RQ}Of&QQ|F>vp=)*%V~ITGyE~nwGQb3
zIA6%DADI;fQFKsJ{wD-FfSz<+xkM`R${w7HU;S1OK=<GG0}&kwWMq`eJ0MGD713#a
zgqfyTQh~}YLqy!IKz=;Ge5v<Cu*gK(yd5_#Q_C1!dk`df4^-18uyIAue{Y51f0h6C
zAPhOzBpsyPZJ)OeCBy?CsYc4{aS{V8F2h9j3AYW3qHXf8HFmwCc%F*5$V^9JvVYb{
z_<7P6kplA49K^DVDEZ1l_jFGyd3XLR4{{mQ;=Wkc6KqIm9Q1}E=_rC*y&ZsXrbtJj
z?`VO97IBF)*ofG`ExYJL54-*39Mq4lT}JQ*r@sX9Wk2C`Xg++{1^Nw0!h(lUq8>oR
zmD0>n{|O5LK$cfOV74QT9`W)43)R+KE%jh+HSD|RI@4)X#2u7>_7{eL4FbLsB`=Bf
z733~^gOo9Vk4Lfn_>r7)=1*Wh$RgHF>HQm?w3{IggTGp!=zV1+BrR1zJcRH_CJgL>
zbD~h`sY<gt6En)4H*KI<hbsr+m?6QUKKmf*^43nkzYnF$V1Wr-xP}|YuTb7O>;h;d
z8yF-zPvzD30s-J9ArJ8n|92050b9(&Ia`T%cz06G#D4$|o#0Q0AoQE-zwRM#_eJb=
z%HolMwKI^epvuEpCo;wmOo)`JQ>+sCFNQv&q%tCZg5`41n8Pw0Z3ZoBseeBM(%q8~
zfBZ$Fu?SQC@Xlcewi7np4l;-X6vTOSE<m1{hLj#J(5X}F-ck;AVI7_aCFV)IwI&6I
z^h=#z;#uRjfvULJ<qS!FAFzQa$^VIONWcN5!7o=82#7zS)XtHK1#AM6HkSeqMp_!E
zo1evgJr!S{dV|>B^I7VGH{GqFvy3#<-cTv@AmjoB5x@EM#P(x_lyoy`#8|@~IrM)h
z6F#p4AO5@dV7<l>QH~o%sz=~6<A;gC#4Z<l^Bv&iU;!R1d@HM$iGXc(RMa^OL27gY
zZc}tTIXvLDVzc*y`%o|<fG$^RLLn6ysW;M2-wx>zNQn3Q4;R!9KBKtxa{)rDn<GKu
zFCkT>2+8PxRvAP=Y<6-Eu6MV|H<rpILF_FJ#1zDS^ef2r0EOuMKM`<@aC_q=`38)<
z3t%=oARo+hESyo}Kgtjdh{>R~^wQ2{emb7C#%q0As1(vm9FRmWWC&}@#k&#V9>&Ly
zE)RJXfw!X$kT6fm#>W$TZT$3)z1u8|G#&aNw=`_IdwXtWhZ(j6v^f`zJ4o1-*zpBH
z?uOsx5cn)Abvo{tF}+hl++qgxy3nOxpoo5|J(|u#e2C<C+?mHu{Vq2TE!{jqlF!gm
zz5w5O3HvyamDL;SgD&|H7F&pnixs~|V7s57x&<WK(V_YRL?mI31R*b#!e}1{?N#g{
z9%yQ+4MRnn3y^KMBub#qR6WMsnfi9vngCV9G?WO7#8U@_`g47)<u$DpjiG~vWHma!
z`DfYb)((b$9V8ldBws(2Cyc0Bs)iGXpHsmDTtWxnPp+l-v!XtC8T5xic)SQWXdU<=
zPj^w6Lr{83+!)TT0<NYlGFKI{?S^U9I4=oL&Q1w6ofycQ4;CYRaMSxA__J_3*DGRu
zDRqu;WoNg8EmbaXgWZV&kq|e^?Ep-+9>kFXf!HtuwPv&*)vNW8cn32LWiB+ARbcB4
zSm&ONDM67Sj3rU%qhzTi$vZ=V5UE(0_NQ^Qvy!x`K)ZfW%!D}mvY#R1c`pitKl_aH
z%E#oT0UL<4WeBJnVD3?9H&CJrqV6BwzAeB#BH_QO$d*cVd3etx5K9hr_z<2)RaJTK
zwe8T<E<`f<_6+&oAkN)VAbt<LFjA?}P$U>}P*|FL2b%OtmMy?g7SLv1jiqVGtb#s(
zEf5|CFTiQOj*t$2xp`1Ms)BfL2^`tLLMy_1VAqk76Thya)7Cc$WZ=WYXuB7U!TJ4$
zBJb|Mh_~>%UfnzY2`O|uydf{|h5oTE<l6ZM1-thgC5fSVDQFud4(>?!LAOa)qFa#f
z#8?UiaR*l+4F<Sk-wI$lKLTS}J0|?j^Du@M_7a38cwAW<!c<reA05(6`x6Dd`=J!L
z^XnD@<6spehV5H{1ZzRCN9b;U6>ut)ng}PyZ`MT1;`?K7_$}1VO};qY*Cp8RAX_?-
zk=Y_$bp#>$A~`*8$=Inrpu?&CofvsvL?lvvQDI%-3qIg;cz?{oWjo^;H!X?&kz!~$
zO+BQR$c087LVGsANKEs@R{{3V8ol=_;Smm!K2otG{=tD&K%_KAA`E2DpP<<PT#Sl#
z`;S}-!c5n#$vY0)kw(ZH9Kg_krYh$MXD2T{yOWlNo|L^#5M_27OTYuDu0&qSo$~Z~
zo9^?`l3o=C0LDO?GPGmdOoBIdt{bF~grtI9^joOx7nIYkk;$Jhp$pN^)cZYnqdbse
zA_an{PT!pmJktOOd~$0s{SvV-<qGEXm=r+8^5C5O@VHL(pbC5%rV6_Z-(31EM!*j^
z>q*h%TD*3Bk4&Z~dkZs{|D!43J&)8dvs)>M`Lk3CQbFw{{w{u?OZF%96J3)^bRwcM
zPiYg01H8j&;1<03Wqs=UEi*WO>ihm!PVK(r1}O-jd$|P>n}si+Ky&!VVA9Mg?Of{o
z&(zPin1ehfO73L=No-)IeA#flUUJI>xo_O}6u6v$+mZM0e#2in)0Yt~8;2i+g-8-4
z${|j0(~wjksiZGW19~S(fR5c03<suqzVBIR%Enp*{VrH_XAxDwvyHL~InKi%OdH6?
ztbB=WKR0kU9{sn}J!~Z@D8tGPS&d{jz%h0bCTx5Xe9szzKW_usSbFgK?RFi6{2@JN
zn(>HwjY5K>Vaq&(oNh?6EuhlpQYSY(;{bC)L>U1|=NhfOoH_ZR12&KNfB$m}JdhHB
z|K}*PfoRjQBV^-?<5#1eX?r6ErT{_Y)q`Ty7U~e_DTJ4?#65g_(xHw{yBh|c<yN(w
z(9IQMROZqEX(or;yHS7VVTZx!C{S-x83TSRCP?lf8i&>9nuAGiA7Xcev^W_G0FWs1
z_+;p_DsX+YMed&h6BuBBwlostMOt<9lN#v1yc?vP;G658K-Pc_{wo-~ojxiK54z75
zBJb3>|F(iCRJm`!=kYDq9C)JiR&H<FLqD2e11Y)#gU@q!<mbuZ=l^{9V#JbM%Zq@9
z>=iuK>PDJV$DffDNPK|O(i~8-`S32p+9YgzACPP<=6Ll0EW`xBxNiW$^YBYCe8H~_
zdp;r8_3r+2xB<W8EpKWyyy^`2+e~ql(j*E?<scx^_W~pAW<As0r_<n2s&>yzw4QXQ
z7k|n808#AMF7F`^*nuSfL@TgM|5xe!pB>nMIDz27$jYq`SQS0^4lO;o!50Sy=vH7|
zrvb{I3v@jMyf_6n*z&(ZQ@<bYF9rD>Z}3<Z!lfOLRN%!83PMx)_+^K9Xx<AYZT57F
zd&2;_2T3F`?P<@JRRa2ueYY%$`|JDuTh0up!S_vKR(bR0u*=GbKU^19c*wZ=bS+{d
zJ1%rL@_27;uJ=I$`g@RNLNO04yYn`|?ccA9JZ%5uT{Ors1;bXl-20PX6yuHLau8K4
zpnI2!5G4_%hXI<SVlKJ&VQOU9kl!paS=?%gegzhbew2SkMaZ}POMadT5$Nq-<rnAj
zM#@YGjy7P6^C!R7sy8_&M1s)N+@X!gm3r`QQb)nXe>+vbw~8I&Cf6}kKo8Dah%I#S
zFDtiBVx%epn2YN=!Y{@H<=XT<w==iBw*rgnJLn!y@OTUXUT(KoL!NL~e`9yM{BKWq
z&U>7Y+J{^KNQn<dQoU!RZNdmYyWfhDB3;1f^Cmvbo+~4rVD>d{a~c<bh*Uj!614k%
zcdl@DM2RYYT#*sUVNo#6siE_cS|Ucs=E~rN8zpEvK?<2y%}xh(0MZ)h`@2K0{J7Mg
zbwY%-cC&%M)c79+!rSLv60vPgMa+avzy~)UoTqfa*k<0<GFWpTF*2P(5En@Esuu{C
z=zTni`+MMBnsU{9{&nWwX+V&Ey(y1_63hE31h9)|Z;_E?kZ*t&kKq3(tk`^|QV{n+
zYk&%83&4a4xpz(fya17Z^05cu@N_qvMT5X-^^dG(jJ-EVm$*oO2%zJd@;!1R7aSvh
ztkW{_y(gy7MG~%3NS1H)Q)Il(gghkMXX>y1wp1Bx$1fd5?eMxG#KQw)M4Ye02UG!J
z-U@D@A{<C~2pU)IfunIq209Z_ydUEPvmi}R`>P<{UBo>^7Vsrwrrj8!Uy_Y%qt34d
zMWMu7vh7e<yGeh5#7MJ&aC_<{x#h?@(in;s%mUOLf<gUQ1dmoBC%og+`Fkg(z~NEv
z-ISe%m*iarzwcX6*xr-F`vLLM3271!!ZL~Vm~TvDHAb>h1|yGW>ED<t>EWiR++!*6
z@0uY3;>}LN2Vu>$;R*GpU3(4KiE!l!fPe&dMJUoYr=VZX(Vl+U?mEezVaPq6v8ecB
z=rVmVuOwLi*(rO7{%Q&Z!$V!}9N{lw-x=#5jT1<fgeF^){0m_)$NppVbPC6xmKPEZ
zoniPNVSXlEvl^a*;>O4QfA41w*w0}TRGi3-MGFBo#R1RMW9tn<6un@%DdvmB6?#3J
z5y8i`IPEb(Xgu!-VF32aQKx1adnCVjV7q|j-#sl3m(lWC3_aLUCcz2>JBxkd4M6-L
zr;+YOkYI1aJ}M?lr-^osCRgEWRy#K}B8mh3ktay<l%gqYt-%ium8f?2)W6@KL=lvS
zQ%;w7D<ap6ACd`{U9f<j2-N(!$f<@z*}@{`_M^U|R7G1pl)SmnlIcNO$+<p#%I8dP
zmu_4A*1bV4IOM9DSrq1hm!klS!Zhs?Wq|UASknXkezq|EzyrrMKkdhN)0C4t?TD#2
zpJ5;cK^#12X$#rm`cYDQEBv>4fdJCu|7svbifo)Okk--Gzj{;5lM9nqg6tB6MnokD
znPuBOyxhHpr0fuWN0pCvf;A}_spUmTVt#MIozwikMMO1HBw9oFIrH8P9?xvK7LFNm
zF}$4T;2$gi2nHZhC6i$PiPQ<Pj|gpwNb0rkuZ8^IF1|#{>(PRrSUguu;F%T?eIe*8
zlqYt1AL)GhpmVf!O$>Z#^I&EWZ_Zh*i-GGPMSfNIZ|%sx2Ugbd@b~8Z!kMMSU+Y7P
zI5e;>8uC^X>;cD#0oq?UQtRIbz&mrWo_0)XEy#z-6D0%5KL=+I@&ESKgZB5s8(yj=
zJD~&k^eg~t3TRQ^&=>QDv2Gty2f^4i1*y^NL@!uy9z!<uI0ffljRq+-h&cf@AFm;=
zCkpmY`l>|yzL_0s^_{o~&6z0xSQUb1ju4kG*!p%ue<7-)VhW}24XAJe+=bls0mWQ7
zW%<SDe%PEr`ela)q|W?(n4(}?w5AComW`@!V2cS%!YGb%?9(=getcjQw6W9xucb=a
z1NfJ4(-^=d8%Y_zJWi;O60(1-_6e3=s}z`7wm^0M*?CWRA}#467^(;IIY1%R2gr(H
zn9e}AIyp;9*zi|f7jO9`Py#cIQ)T<>^-@6W_~YLY0j++e{Z9KduW`l;ZPVrRTemkr
z%R!f3ky>IU9?^E{1W2WQP%u;iJpew<g0A<o-XhD^Q%IDva`Z)<SZ!gLIe42rh3ikz
zJq1qzeeaYkOL&TuKCZe<u#fneC4A(sSD1p74;1B6%*bk=LJk9qZW14&2Qr($Noi;?
zV-QRovl%MK2w3j`=o6@P0tO0NR<I<5Q{^z|$0HD1yHt)^gk+NmPHnySdw^gSu-Q`v
zF<kk6sufbTVr1y+dF9~`k(lrQ)^L~q1z8j$_8cp(XZnhA1_0H92Z~QORKn<$BhPz|
zHHn)F&zAiva}pW6zifaRC@G9}sM-I_Ef?O>padt<ct)m)&z5c_Z8NYhSrEJ$2Fu;=
z0Zx<oWZ*3ZHu&R@Yw6A=i1`;CpbsjW93CpmW39DFSq9qb=|ILp;vC3(T~OYOJ;Vv@
zB4ad<a>5b_P4#se+%rJ*WqPN3jLp1ndI=hVJPIIYVw`6Bd#FUlSdlNP`2XRHwxtAq
zdU@et=ouwz1{wMwbKPQx1cm~byq$mqzP=~BbS5h0XsH|>e4V?%@)LxzwGSrlQ!8IX
z&Y3>xRWr+HhOU9*=}&)dSJ+ytNaN=UyCBj?_<}^a6_)k_DVPO;OF>u(lk9VU7Pp2Q
zlADeySBL^DPzBDwcR`y)Y9et2scN`8x_JVm@+6~y>%1zt)J)Rfl()W}>DqGID!wi2
zz$e9LhcD8eN;Z1DFPX?n>hLK>P3(bd=8gR)PNqoPM_i4PRpWd{Yqe!G)K~hsRMg_L
z#q8as!Os>3+Apk!K5u3%>Ml1s4Gg{M%s(mnNT<-cqkkjT_<68}M3{%KUeW~5eIfM@
z8eMgPyDJBY$ffoW(>&Ni!n()1o18WO-TR<a);&bz$xK97p1(^)G7JFr(+iEEr(<Pw
zD?C<D-MQL9h4g&@G|>&MXxz=Z0K09#T?!`t*koY7>`=9Np`~<rs^=ECz5PGzy>~p-
z{r^8+(m=}*nkw_kEF+PXQDj6M9J6$c>^-s~$|xM0jFQS8hhv5^%9gz;TcJ>7{~piY
zSMSU9`TX(w{`<Y%zW;T)73cMOJztOK<8fclLNIl;9x&!nef3zta-auJ!lv(Z9h(F+
z(F#N}L}*pD0F-B<JM2c^3`bg{5DVVXvJ&eJ@L<dWzH@#h_1Dj@eoR4r^@Y-0LvRu{
zh-n&0JJMBNJcNJWQp4>E!7ej^4BJ&K)8l?jqKrLYUA`Voe-iyIM%m-bh>+FLn+v{4
zd8zAv2@1<EU>6_2$&j@39_4(@r`LJjMZ5rLuop(lOlg0NI>{7^;9AC^ZqMgSp^0C+
zvkMbFR_o%s;k@&rJT)S|E_z9!+Npg~His`;=sZ-~La5z)e+pq++!W;eK0QaUGU&(@
zhR-!I-h;6Ou|z?i>c+IZYeRKb-k$j9ZtvX39J_;2K8Yev9Nl~7v-cqbUmBftU0_f{
zxdv#FIiNOk6?+dcJpRQ3-LGxb{5x}i7U#U@5zz(um7h6=B?FdFm!>fq^aMgQIpqQE
zs4{>%yV9^6%WJ?PRV6r&P#}U9x}e#dOnZWu$GUU-_c`SPr*C>15f>c3s3BT`$l1x_
z0qTHRG}=Pt+dWwC^qG@;geh#_JfK6-eLO4L@M>7V>Q##KQ_BhPQRO#dW<^;HO0vBb
zo}TYtEKH*QXNI}lex|3@JrV~wHg_!_O*qLugT6_rr`zmd100!n+A)61du4I5H17v6
zy?fl0$leQ0#*Jz+z|quamzTz6g{FJ+URw;k>BX8i0_qjd;u%(pPLnJtW+wE2Z%Hvi
z9J=$8Q9>^~5qy-6t^bo_eD4E}oWs~RNP7zY9Fc8ygOxSVtJa@GwwsdthhS&38?(z}
zhbAx%4G8_%a#Hj301bh`Aa8eq6uwU2$D5;nFaO_fLb2oy0*gk@;!X6Rgl@ox@<tR8
z;$bVUPQ^l;BIZJ^n`2wqP^|RcGAtKsmsu8F9-#Sw>0W+QQOkH`9QY+iZ$BgNDYp3s
z2#<|B+f1&-r0@Fk%XnY%%G2zTN{y9gE%tI*R4XT}-dwmP@bFtD&K!af=h%1HKWC^%
z^9Br`lLm@XjV(<HqW#zu+5J>Tm?-<tTVVVtX7Q4y+t>i^evJKMwX%p5E}t>5=f~G>
zo;&j6G7)_BzC+WV&odWUu9$$i`bxT~n^0S^s-eF@J!ckPY%ef+wv$59a>^<{y{l=l
zxjvT{F?29TjXyIMBMd99&`rlgeN-=ZbJ4h95A@9I^_bw52F&LNs|*ks#VfxlP?ZAA
zH}zcT$%ezYP^Lr9JGa-5IL7<6p{mI)htPu7VMPm?JcV6?E6VLvqIm`IzXPX(fzQ@(
zr><9%!=(Hqq%lDUIIHDY$1iUHcu$r?l*c`N`nI&ADiPHXb~6wQ*m4XvQ7WdQHp%$L
zrO)K;zGZz@wA?y9kbrdkGKhzC%y=k-FwSbY3yw2A&mtI2Kfe^+gfr!rO-a0^uxZXh
z-)mhF^)^Y}I4v2q7=dyToCfqM0+HXy^szk95tGmWG;0T>SxKU`MqmzDD~i*eLPyEL
zG+-^M?8ax6B`mY)7moty9EEOXF{}9nYI(`&%+tqs`*YS$gKbRL!Dk^UdP9-12Uvtb
zeHkXA?EXujh0zu3EVyng&FPLkv!Fkz=uKF7;b?$BWKrl!uMk#jllh<+$N0p+n=xEd
z;lAd$2ERndp#?^$&te)f;q1EJSj!cvD`{!1xQ=6OA~Ylh)f1OV!R<J;bUp;S^BMKw
z9WSa3E0(dv^6{J{*^bv2IL>N0>8Y+XeptJF#6;YEFK?lGSP>&ZW;k7qS2|Lyo~~T-
zPsN9acV><JFpk&~a_V={Cr3s9l^lIDMgt#fHxI1zwGBz#hw`^tS;}X^Rh4XJiE0iZ
zz090&*!Z~@n0FfSn3<QGjDdS{(+#7wMB@P>T7rGE^cZJrv{tlScx%m@45OylV~a9m
zvo9KKNI`Acx45d@=zo<@Vkw?cIa83wC}y_l-J51v?k=(R1cmVn8}SJeMp6r6@l<M(
zai>AudRyW&J(t!P`xEB<b#Tj95v^|L5|aAmU1BQ|=nS0;yoy(m3Gk>>JNprc!)G?~
z)+%Pg*dd?T1EYyd{ys<x4|CwfhVqf*0Gb*WRoBZDJYEWt;X~<Z$>~`o>?tX)>o558
zU~KUi8#NY8^T_;Q$K@o^#k5Ak{g-xby@pY{?fGb%-7b~Suj&kk0Z$|<^`@g7GvGEH
z=v>CewsZwwM^XP~*Y-+MCk4gB#ny`rZLXN>@Zq`h;EkWg8h96geFj<b6kMm-7O<_<
zjcV@C)Nk`PRqPXJuhPfQ&%VZS#CB3Q-0%m1$3_-Wg7u1of%0GaOwYf0es+V}C4Bo#
zv{gw+#6A|)pC8`1EHb*t6m<Wkw8GjhVTrX+OH(^HRgU5AP*SA3j&GezbNl$|8;ovk
z;G<5w5F(~dcnN8oh@>lW`dTU8wj%oc%!)0umkgO?$}#R(9CNEOSx{g2bW5)&lW?{v
z7prQ`6LzagoJjf8d)M-f^%$R4-=K(zWud9P`ML4Y=!upVfkfHLXVu#A@sL3GD6WF0
z#p#0`k94GS8=mK1+nqQpuCW$s2DJSAsNh4<t^Fl(BHXq=Rr^o7{~)+NnxIMfseVb7
zifhBCOo1!rIKRo^?}SE1WWv_neU+0ekh=+m!L4m94-b|iMNT&_N9jbVwHWA26e}e!
zscLhP*i~Q3Ppk-hPI&UuZtLVA&|$9vVwLHh)F!#}k7*mkEcC&t;e`$vjDhSs5ZGM!
zNGs=VkUx3cOf=6mOeB9hTb!KE$CZ$})AJ2Matf+7feKj3&wUGbb>R-=#<{ppc96*r
z*WJQOr&>?sh4EyI1=fyqHqr<}(fhj?-q)ZwLv$8ctcLE&IS4j+1679LKpm3U0tG!r
zB9{Lyrt2;V+Am+u@bjZQ#FSIv+~w&S*$j?b5>)(AlH!KkVLiM~a|6*!b?eIc^uN;Z
z^Q4isfrrW#StBFQVxH2BocJLI8JL2Pn8Rf_h$h(x{Zr4<nbtCw3-`m~dkZX{q98-d
zxqu8%hh!y5E~+Yl9_#V-7?U39r%X+0G5mK!U41<~LHRDdF5L8;dN1~^tn?ZXxdWcq
z*)vp~-i=!<^S><L+fIyEjCAVIVK3|^nz&57j|skXo3ruE{o2agzk9|%S6m3Kqcg{_
zG!<NNG`ixj#KAzFt;xml2IJ$7b0?G;>g~4ZuBN5IVJ2r5>2dtSRi4&v`FQ@2a_A-J
za;+b<z39E%M`c{|Io-K+-1S*6SL++`fj;2WpQf4P>j^{0lzRMS@V&Wr9;M$8`PB7X
zYHxQWXJicLf~uJ5o)!=*9HhEYHPvaPDCz4V$KM_@Z%t<r#0s{KiSM=0;|oLJuR^Ux
zityFUCT23zPaA-cduLe<6xm!I*GN&!TN~^c%1u~7_%G#=zdmUV|GwTIF^8NOeu<pl
zDNU!e=bs0>gZz!m4nmFg3ai#WiMLfyK`~F)XY|H4VQfS4gGqdHW^{hn7DivF>&$`#
z-3`L;;3j}qOS-4X^d)bIaQ5=VVYyTbEp_QlBiOMsFo98?aO7kRyr`RJ^yJA2!RFcQ
z`Y<>k`Td4i?7QX7=DR};$SXIIGBZ%j7dQ;y$;GCrR?cW;18ly~J;;*c)k^afPE0FN
zk>}bfmb|9{ef8k7<eN^0EI7ct3yx2p^i&Gq%uR4{7gY0NYUK<uInH%MIx<JUI9U_6
z?IU;JQfz-)$Bx1adQz$>5UuJlZnQ5fv}Dze=(9W*I+un&r&DCD`TEhJ({tR`hEQ`0
zAhVO_6W{z19O4)d=jM6?devD_uI~G4%uE%!QHiq?QmeE5{^?5hJ$WHY<J<twJ|}1@
zErSAlb|1)zx`auu9a*#kA<hzTQAO9On*?zX_BlIPD#*0I=JiFBcYFG-J!Lw;y&QiC
z>#+YAFEbSpTKD1t6S!vIC!3iiz?hO1G_<duZAvGMC)Z-W#&n;I1@f$!bRbMNyf<mw
zzZNrhBI;x!6Os8E!*{an)mjLiC(CQuMFw;aiWT5PF2%=j2VY9s8DOt1EkD4hlH~E+
zDA|0td{A^5^wY)iW7e#MQ4s7nHlUObq?RNT!`vXq<RYM{>>74YTBzTnmzBPYK#!Qi
z<ZA!fltH2B%0e$S?#DrrLwjR9h}xUt{bvF=V-|nFOmz9|B&a<dY3X^f0m%QkQ}wt-
z5P!0sK@d7EybB^aIoHy3V|@0JI&Y@ufJB99$|gVtPH1-G>*Cu2-D2^VlkPn3Smn)k
zG7U5Y7KW`hW{*Sg<oG5I{Aa}b&+977O6JE|7k<Fa3&y?>3HU(avo@URE|uzl%LsJF
zCQIL0z3ZWy{HP?P`Heo1c@_ZI&t-$3;w~7w_x-AO4&W?_?sj#W?HeEZ2~9$&xN-N*
z&y~IU{K_1hqy<XDURqrKGJvHs&;j$0=P1PRE&*R;Oz#jmiIuU1eA^gXZ6aQd?M!MK
zbDm5XQ<Eye`WO7$A`7(`fEwbf3c_>jKz3A85r{XEhu31Mc@AA9+O=Uf)-?il*Qmhx
z%V&Ff98hDE6T{}c#2T1fBF5iTZ+v1ik&=!i_cGMpzM`4W!6If>+;4dW?LUI>!I`G1
zWEiwD79~1i?knR_bKXM7*MP<w(AnR6;ls&0F#4rNaA{whqC>bg5_G#~QBslaQyRfK
z{}Hrv*Pb4j<p;j<OX$vqGw<->BlS_*yJSx$6~FCX9VoI^Cxu8K!a>i?o95?TGds*}
z2DX$}`mM2A_fp%WM;>WK-W8K&enn+Q;X#O%-f<uy3Qf21?;7b8gVu5J_HA;HnzZL<
zId&$2TVgI4Z15zY1B8-GSM}oI+gK_l&sOGNEFEsrY*~l#MyI@5T~glBd7>*O_{Tcl
zDZaTNEyXP_i1}uErw&M1h@}oe;=#3{qqQz)x7RY;2q@TDBa`E3jo707Y6NFi`xB&K
zgMr+|V;HC6Sdi2C>1U|lF^+eL<kBta5Pcz!-%u+wvc+G-zWM7lh6|5GQy|wnKUF%y
zMNA<|GCs2MyLajNI819q0FjHm=3u_0s%43#a@!qyg0TUpzPo@R<U@}$n2Stqe0xgQ
zs=9sWkrBa^emqZ8CKC97u6Q<*NxMs%$Vm#3=L&q@_PIRsir)^f{YqW9KB?$o_PwlG
zKIJ;yiVetgF5OO6OTCHHY1IM$T1j*o4!_E+1Emqy<qjR!F^hSiqD(Qzcg0M+mo;hQ
z8QvdbA{{y7Avmte32jYE+O6<MXe5(2fsLJMb1&yps?H?tyjs{;g&+V~=Au*c$_Hwk
zZGw=M4}wen-ufggF%tMX#nm?_pq@w^gb#8e=1mdGfhYV~cy%SezRnRSOa^R0Ddgig
zNo(KTgT7iv4KM1BeX(^z!~6^2raMu@4R=!TrICHe>SPWP%P0jV+J~wawCz^N%22yw
zl#4iJ7`wY2h`9aFuT`}P*o}fFz;4%w!|m+E>l#13v~*UCVoZ(}X}vHkP%F0Xy3w63
zS!0~0{Bc>{1-uD)>3`jPdfuN2bH;6!NQ4nZ88ZyndFe%aVH((hC}ZuK#9Mg~!lRJq
z3S*gtfN6(yMF1Vw7qKu|w%bw${^W;xA*0=L*WQEsiwb$Jl^=q>+Jud@j^{wKMi5KV
zwA*!^5R%pK5WsuA&&mOHuLj4*SF!HK7!U_E_)B6;s=o*Flv^{701gztgW2nvfd)UL
z{D0ail3;tq&egdf7&8r9+(x#3t`~;NO|s@~K;1YtDZsYw_u=wk+{hJz=zs4#Dqg~{
z_ugdFaHfQ(aYh@BGQK<`k60Y><qDlIbJA4QbU)=OFS2zI<vJ-bSJxFIc9+-FZj>#R
zfiity9AC7;<9`_E7S#+SgahA^^(KvXqv`mM#xem%Z0LUh9KV0cD$S4ccyboyZhx-L
zc|$Q~AnFSc=HSmmgu>k?MZW~3pMB;V=6v!r+csemC9)R|t_hm=fH{ViUCnwVQ2K1<
z&zQ}4WNuvJ41uw4Nk2_5^b>tObF4z#r3+&$#YPmVhS5GWiRZ4a<N%QWxI$H+1r!Q{
zD>0(3j%v^qpc)^^mrcn~^Fjw-peD3Py|pX)iSOoOpj*FMhUSGAM4l7clfPd0aS8>+
z(f@0}L(S+IyNBLEbc-gTPa8Y=6$Z>!-4xSk=QHWrg<0ui`#QvG3-z!(AnV~fps{?Q
zVA-8@rxYz8h-_@Oo>XH1`55dkv~u#Uf;WU9bF^(T&G8a|)bU5vifzZA&AsaLT?FGm
zoOLL!It6SgPcz#=titAl&D83|vero92`0}2j$rq8X6p}Q5Bl;Z_9RiwKn8mZc~YYz
zB*w=|(@;2;yRf}m`Z1mYkUOi5wB3WbqS1rhdzy#U9A0dcIDKPKX?TN%^MDMfDXZr{
zUtjOIsxYJ0#MEBgz04SUoi5P|I@YVM6XWS=7u0ZYS8}T^z3ht#9vhC;>`o#34z@fe
zve*pk&y^Ql7RGXndApWb7KEfDrP~ax{<Tq@hP`2|fr0^T6ter^gU_u4C8^+^CL}G%
z#ST9&P)QRDsXfzr_GbIbA}!YGP<8R??8KQamU4FL&{k*wXWcfSmDY*Fgx5#$BtFi1
z-}D*)xZ;y?vMT)<M8k&n7T+XJO!M}!joejgfNtkx>l&!E#@E9uOmDx+13Jf|eIQV_
ziH$Nvpa(izUItZ`u)cfKZ?3Dg?GL@xdH(qji;W7Of5*ieCs{VD*D^kQEu9Wz8i?`y
zlKOg}Vz=o#)_DplC5d8y-h$E}w>!yg4ZTb{OH4NC`hIMC*ON;lf%lI;bpx#z*>lhb
zoQvR&^OcY#GhGz8_rs)H=_?YH!T>+M??PfNqt{g|o2D~a+V`6^!CFZI0F<xq%8+$I
z|1VLJZ2LSQp$4yFQe}++y05A7XK1DA4)`DBKPi0ti|O-Zbwx4x>V(!`Reb|zKl#2%
z9~p%X$^QP@&##jYYuNgdEUY3-Ja69iBvsr4?QbT$n~va&WNsXnj$4tQi8;SRF8>0f
zJcebL#=mO8F8H0cJ9x9v7YW%<e)9^xRp%ij!?I`>@#9w^Ikm2biQ3hu;R-`DHJM)B
z8e5{90T4`)5$9CjLj|@1H220|`qGh8^Jy~u)&4Y@us|*8RFgQVjm4;4S=l%7ee?vD
z7@`-?znUo#SRj8otWa+Wc0bP8Dr*RMH4fAOwXE(5mP8hV7IUH6Dj~_Y@-CjxXIMbn
z>j=^7NXvk!ho5=x4MM|#{y6h_MXoyL2HYGPg)%lMZ9K-#`~Ti((4ONpa~L+8gQ5w(
z=o%%_HOifhE28vDW;_p$Fm0F}0PU~mu2^62S$SBC5$%H-R^(_oULTPMsLAEk1I~=&
zPy&*5>aG8Ud-fvej>}C1{jxKliz6m0Wqm(F){6NBv$pQ;Ztx%cJ|+SpQ?8cENPnfq
zF;<2fwxa!@)ahFRC{aHY?7Z>v2;7VC>-5e4DnR|HEI$c3fF4%z9{3j>wKRH?aDEMk
zMpix=T}6kWSfoSWm!ZAe-4ZFzr#_zLfatzX$grGSd;Rqx>Y+$OH~VvLzP$d@EV}xq
zJ}kPEySKl!&Vf4^0}B@v^AF#@{|L?BmsWh{j*;0-H0DB3Nz?W4ziw}=Eem23ZWZ77
z1OY@Oc-*B0w-Jlr{^<Le=kO&l0Z~n7|Lp7N2HV45hH|r9B1d=S0{RzK5xQqX%*kIV
zJ~cv+d;}{1g=WO!NjXo%<gEePp8@kMGQ-{A{eJVKAE#dZ#oX<8<L!QeuPXW-Z-Ju3
ziG-R|`rMBZ1zXw@D1spm9!^J3&+?Yzris4nl!YR_kZbZX2zc#!Pv|9@z-9ZhHg5iF
z0dm7%)<qn8^Shfr4<9I`8r~n46($2(nArN_T0hdWK*dLM3G1}mk5DPZ%z)df5lB|c
z_{&a9H>VOF(-LaJ?y{L|yQPPqI?P)yP@Ez={GARaV5{+`CTG9HdgzC2{*!)44h}=6
zSExLBfqAb1xG54(fh@eM&}vv`!Ml3`t%W$oCf9$xqbJcF{Tnwv^BLXIplEuDGt>Ll
zC&jmN>Gho7AzU9GRN<W<A@pUvrPh|e6++wF3n>tHpij)Emt3-c?8lj)B>qkc>ek<7
z$2kNZ|K2dAT6p||=<)M?lRf}K@fyM_vp-q@_iivI--c$#R#F~bL9b9MaxnCoUv`|y
z0S^g3{ZKQ9T`07_N^KLme||1v$7<c-rWPme68T+O&~vjThfzf0CMFti6eJAmf`tl0
zui4Gj=s6{nX^*5oeGgHz^E5-&>rpBEy!|Q#!+*c6#M=kx=||g3J_Ia54zPr$E^Maz
z{N4haI_MVpUmYCqs<&*T-=h2r)HzvHnRcc{%r8M=YVn&|SQ#hivoX2GIwu!vS^YQ}
z<oIuIUuvH!;CEc2L{}n$XpLe_nZsB0Wz|_IHnw)WX&Tv<fPX7;v(2++?wexNmkcCF
zjk9JLt702_IsTB*>W^t8FX#cElwlJbV`E+Dv9VfkyK2w!B(&|K+_#wxT8b`B5ETDj
z$f!d7DOQYB#^~YPN5ByCwQ`8^vp{wt=#}s`vh>-z_Uu2uaYQ(Bjl95W*b=NWd<VFq
zz?8|PE!BXyL!=(-mycjZa_jjsQQ(3?PLA=qun$efsZa!h);@L8f4RH`bc<0c^L3vy
zp?;!ev4xr=SMmP7$6L@-`+H$PIipYpQnt9t4y|n!YS{89H3sNU3)|O1xbPTvQ56nk
zDCT3eggM5O4WeWvhrdCq^KciuAQ>?0@K3x+2~7qf!m<LzM*=^Sq(fl|J}8t&r3Kwj
z+{c^wCapU2Rc1W8W%o!(*@!)y8d_`Ye^?rm`?`A}BHo!AN0bUS{#R`QDdK<2`x!Kl
zU$`CEFBMw_b3YG$rW-VJrSL~&*0;;_23}4Sg_rP(X~;jPpkb|^-7r5Q>V;gT8MwUP
zVl!|Yl^zfS>3Vd(Ir79K>q+4a%K*-M70n}_n~SF}1L-geu<v7rgEyoVKLK)%wuFAt
zo|*1!{r8K%zpZh{PS;=XsvlxY{J8Dgil7zxpX%^75G{Y6uO-p+o;WYE-2=Ow(B5?k
za=1?SqFEYP3A@CyA8qnkq{ESm<o0LvUMrpIW}iVN7ZhHD{MW8|S3_@m$JPX76s$Ii
zk^PK;ZXewuzg27vAw@|k{NFnP{*RKFKEk%Zfg>i1f<^n-Ay&!5a<DPVe3|X^;~Z)h
zZ{7a(2gBRHY^ziaEAATVca=s2@avE+V<$Cdc;`b9$qdfeNZ~_6Dnj_&^)j^t)?l*8
z1U9R9^v$E;FAY+N;DlA~2y9~?NLmb_cSnk;nhstCt`fK68+!=oxDOI4sK8BSz;@*b
zZpBI2_npE#Lz&;<J!HJKACzYW7X1@_52wxR0KUEj46;*2aQCM*=v($NOqZ}R%9j{L
z7X9;{ke@jWcqtVd;tuX#L*2PjQs<sVLC@z`pB&GUQH9Cvfv_E*W=Lsl0a%QsVorl|
z=AtgA@sJga)CFKAzHI&JJ~m6I_kb0TT8i@A4%ya1J!C3`TVAY#;>@!mo*IqU+I#iE
zVo70Ir2hzhyd$=(fzp1;lhs?}X$7qr(TL|p?r1?XnO(C;(9!h)VP_1U>o$d1;Bdty
z5Dvd66SN#KftBWzr~xzsD?xgs5<DeF!1Cv!E{Z|}yqSiYr=_@Ls}mpqDj(RJcNC}7
zOhU6qT-xN=YR1yb)4^`|B>lJ+5$|sSSQ_boY64zZ9_(ed6~Suw`C(P#mw*>EUrVeZ
z8Y{e;S4QyE$^F~5rG5W%ryo;VaIncN(C0mn2dC4xYF@M7DSqqij>0UER-ri>?=k^p
zq5#s(OB@aQJmE|T%PrvPiyIt0IkXEhQWaI84>Kq^dFZRP-)>W{SAnaV88B#M*)qRY
z+%!EHD9x_>a2PBIN^eqdd9T(=VdLw8NCU|d0f?a$RPz9uehD1UMfz!`0UeNjAmR!=
zo)mYtVHrE4+HD%YT!SefwJ%FOp8*`&mVRqNg{vAg2;*s7Kb8UFaHOk1q=qYG#URZ|
zF#pLyrL8@w4hb#Hj38X80yp=OPO;6vQPS&jab(j#?F`C2>Z>-qIPTe#b9<-u#%@Xp
z#uVu`WQFv<$a|oyTOl&Le6`;Fw1BJJY#(!ZZKXe_*V2zNZG(_zvwYY0`xEy*0I1@#
z_4Z2n77fsEBCL|2yA(kf=ww(n6}bDY09z~5>0~KMIbfV-&}x{WKvoADSE3)27~v&U
z#wBnNAh%=qeEauzOBEvl9AG_U$a7m+@U=T@nV1Mz70MrYpPxw1M8PFw0X``Na+1sz
zu|7JG9e_@Z)9`vm8KNA)3hJXEh%R)vP|Bc$C+Nc)SC(p8XPjmdVkK-P>?($GO-iQ@
zsW&m(%s~u>jd31qhG;yvbOjGI(UlDzouZudDHZKBSp_mx=q2;~=rJ_kpp42E)Dl@4
zTm#x6;l&cr)(QkIJSvkKrtvr3WPJ#?D~)8{%bVPS>HZPlA@p64LT$DA%jM_aR={Zf
z=~DXcc+T)zjCo6AkaV80!Y7x%IHAGG@zeb8mWTTDeAGv)Pct7y3-%qVMC;Sa0{6Ky
z4FR!lHrYw%&t}R6j1`vSIphee=6rBtlt)$6x5Q&XkITEmLyqrO06otfcXMZ~T<1Qq
z_O}*T!Le}U3X0^>XK4S*aBu3Y8L$=#Ek=`PPxpiLnfpbh#G2@QHO)om@3#s`eQ*8k
z&WHUX;B;c)JVs<WoTJu#n`m2q?n<JDSr&Azd)9TZhhZLd=#(b{WcRUSC<7)PM`U;h
zQOS7qbKohA8dpEAF*B3%`g0n%!^$j)yoD^uG~_G=aRBekePb~}CUp=hFmy}GziK(6
z%FLQ8$r#vTJj{pDbi-;BW=1REZr7i-iTrq0<s~5XmSRt;nMg9KyW0Uv*NF(sCZ?FQ
zCcRn_Hf^PC?9fu;n!v9HT|Zn3J+H2B2zyI@A^B0#;mad_Oo+sqs{o^w{l^0QyQL1H
zh%I4yfGQdo3m&Ei`A1&<S_Q2tMT;Npe>Z*D|7F>ra!iLS8lb-Tus8H|&8H9fGnMzA
zTBUF#EyJsNY4q{qL`(ggTUR2x6vWO-tfeMN(s!=Xrj$rxkEJz-L~!ETnjZb7k|dp+
z<!m|#J)Q1Pf|1?;9MiT0FtJf}cI1unF7=Y{3<9QVYUa$!3eo4^iPC)i<f<DjuK-0r
ze?Sbvhn}S4DfwmfcFp#a%qES}rWT>iFJ=#DoS?yhaAm}YR(C1=-nGm*-ls=p!@rZ-
z+a8}y3^{u6q32hTOf0q*g47t(<|qrCM3%yDEg;@jUE6e=7P-(S7g*75soRh*@v8o4
z{HYk96Y<Xh)VpkB3Nl3{w=m@wak7UO1;-`V`JQeitSWGk?;?4HnJ`~a^nTWyv!B2+
zQ1^Ii(vNfAxIXA#_s$6JU0vKMw+HCnU4sucKF2<*r?GxyDOA87O7rmRaLY{1hxl5b
zirNJP?Z09BxM|AA+<KM%4R4d1%J{Cxi2K~l*qw~3yF-dro^m&606)ndsP>#j@hiz0
zX5^S8Q|4Z{MRVNkQY5ahoY#4b4V=eUMizkK&zs2j#m;Z}X8tN_TJj{?6ps62h@hG+
zx*jX5z^BiADW&B4dbN<6fU@Q6v6(VVq?$m|H&qyYb$sQ|EXGrlj$DQlIT<OcG7({j
z!oANfcRUgtcFO~8IR9ArZ_m#%49GT=BL#;EES7)u?)#@>+(CBk3RFLqHlHwbQ;0|g
zw6_1n;1#K|&U>=2D0d#{4C?4ul9w7aG*wMLc4~8ihv)`Kg*tSDA=+#D&0pC*{S5oA
zsZNHeVGg?cYYl`;T%McjkGMkLo{c0*Fe?1@3K)_O2KlH-#0z=c<yM%uXzMlft@KB>
z7AehaC8Bp-hAuvk9h+9f_u4cN6z8QvJe#jpxYJfc#_iXN2gLF);S(*1-_n#jzQ{-1
z-h8!l={TMz><rTqFbN+)7<<0Kc1yHVU14@WlESPHF5u=Df83+m(4$#12rK>5AA8|h
zpkQUhy!%bn+aS%3@iS`k*Tr<muBhz-fKnf1S)6{Sy2ahvcz{F^HZ3Ij&8jtkbMrw@
zXQe{a{hrU{P`_s;Zl^Te1VCFiLe^-p;(Ixmh%F$ng!)cnR&7#)gMNg68nYXpc%{%Y
z+?o2qKD(8^$1CDMe#=U<R?zlo^SLt@$(qbtU3<k&hs~?Kw>)Qo;s7tv-<-3H-#M=b
z83%9*?R)+^*JhAaU=46_U6I)KP@J?29Du86XM{KX%G#>LXu|xejIU+B)h=ea!$~Oa
zEYXK4yr+d4hiQl!+}6;Zff*hKQ7)!myU=KV1wLTNFIE>o-$R1lsE8KS_%`A@Pp76n
z3FUS+#F&|OWw`XbfXi-(<Z3W+e_58#=6K2Lb;7-RWaL?nGVOk)A2SQ`pJQZFF}#L4
zL5vp1B6SHh6+ffa+4bHyOP>Q1tMPl`Cbc>pcWPOD>#mXKE26?xAR@2pkxCjyQ52g4
z;I(?ZWJoO(RvpNH8b#XL)8au?YByUd*_(g29d#v-xKvK?=to5N-Meel4B8{604<$}
z#KuUM7VFH)A0mwM(zP@N`v@$zO?v+Y@W??6&vLHaTNSP75wxN=6{=>709$)}rLW|0
z_iodoi{gE%;xgG?O63hU{sORHiOo2`6(bQO3`#|!9a@}iiCKr}?-aE-9LxR^tf>&S
z-W%Ub-Q<dOIFk7*Ip7dBLfQZ=>;#nDN_fLQbsUR7!maZwr{#e^XRK&amr2?uWtoTB
z>>Q_LA|bH3ZVo7n$KT@ma__?CKuM7d8gu&!Ani>+f8-?{2_lE+rO;H&by^kZ84e0p
z)I{w&kP-Ek#>C_LGtsJ3WYUq=KA1Hj&>O4}FK?SZ$wQw&y(b!yd!kO8*$+N=zkS%-
zwMA5o*vFROY@m6K-%Kj=As&GNo#ZiGT7;S{gLP0iU1$odqd-AMy9y)%#s<B(bD3PY
zBLy(0`Nwo4*g@4~0|TB|(lsInF$GgEh}X;0WL1bh-0JEcsAgjaEw`}?YjkarFJ_WC
z--O{Yc>=WY#Y+yG5u6}%wwbLJwx2p2AzqiGxx#SKJQ@-G#j0iZ97=(YKu!n*h8x+;
zL#CTul0WCil^=<{c0gX0mU(Qh7`wt|F7+^daTyx>-A8a_v&Vu?G0I<(^xrrMe6?FI
z05yMx-W_ChSK%4=@_LJ-zr#qwhby<L%)fe``&w#$VA9`%%mbpJEPFxC9d*HEaI4rv
zCK83Sd(_#hh#S_2TU%C-+a1t4)a`h#C(vFcuGpK6TWNPsbz*U!C@8RnrqY+@2EX<d
zrnbp$X6h8T9Y{K43`A9k+{ci+&F`VEGt3dH#Fn$K22AS2D>DY3)XX;_(9j)uF_Gs6
z)9EqB21IQgf1h=IzdvoNQEYlrq#iGYHwpuyJ$IAY<mhK<I&5yrf;8*J`r;YD$Qz?`
zkC-gNs*J7fl!)Mbx3s_JIDy6K1|y^V+`{DSwktnedlxn=A;q|51mM@Qz=z^pn_#T8
ze^70RvK~uw4hmFB(e8q~??_9|tDCdh$G>vzaAs{%{Vwd8TIPaZd!i-}MvS?pXOCPh
zCYablYvgNB1Y#zuER3@);6!G7O{qejKG=4S((WS>+Q*C8nILH$L>#up?4@){_|{F~
z%O#h@U=Px%X#r#M*P&VQUuAH5bykh~U)n%ZBRECPpD8yR2GCxIVPf7S;yqDr(&7~t
z9_<2iSx@NLHzDWuOQ&6KJyh6}H+|ZKGbxyf<o$f%!1MHqsrRC~uKmmTUoZN(Yw(xj
zHy>7^J7T~D1^q<fj2KI+bY_LU7f1Vzf;H~GU9-qw(EsepgMn)$IbA#EB$5>P85?GL
z^K?6X+*AZA7seYDrTTL{DP;3M`W9(jljB!*kh&E1mhrjFt2FxIoJHjpL{&ovwZCwT
z*uZV~VEhFM3_P*yN%+{^Ow?J>*|l2Q^{mQD(}|+^wc;)-7#j-#*69Hr?UkJeq-*W@
zISwi%8)MJma(Bxo(egf=FugWz8auT>liCN7J2N2s&VkwTLh_Ch&n*v)cZ&cr2zl=E
zH%seF14{h8c@*;DBG9046R-yX(*FH;t=Aj7{5hX5?a%K<$DTFEWP5XdOt`RX@|_<{
z*U~=zGTWDm><L&Mcf+oe{caMGAbT!81X=oZ(;Ap<Qyi3kgB!^W+PWiOh}a3wD^Fc=
zL5%wOdx@I{ReIyAz&pANmN3S;CsUGV)@)w;9Y}-d&JEuQ{U0;Ee4Gp<N~Bh&+0S1p
z#i#|nzNFaeNUh@u+_~~xsrJ<A0S`<R=qE6I{EzhzYz%(J?hG&yTq*1=vUy=KSkj49
zaMp^+pjW#KZt!>&!w8o=(5oBgf1v!q*u=E3GP|}8vXKIS?xY`l$|3a*E`l%DdYG>l
zxv6~8C0;q|rFFD=y3=KN%ki1p2jN$hOyi9>U;=1*c<@xLq3d{uAk(?%GibJXn)*eC
zYtK*a#IwX|w>JhJ&>I=qi|pIpF@!Ti#}InA>MrUcS`mYkfD=hnHcJ6b2J4*$j;x|e
zBw|j#`7)q1zw5@>jd)>!(M1>&dqEK5UGj;CXhP`Yp04}_j(!{ZWfXyAhE1lj-@unD
zd~3VW_Vx8r?iTgO^UkE9P%elh9ih`X{$tK;FUxbOnW`A+EEN91r7BYky6ZcjAh6%n
z%~C#PyAJqh4s0KIHGab(RWUv5ce^bQ24ui+_-fI<{@m-LvT%J;f))UDW_MpvQj%6y
z;rL~m98nFpu}d9|82^j!6v#T^Sf}vuQ4pQ=!vOnq{2RRT|LH@_cZvspNZ6#6i_Lz)
zewH*DX7Urp$Uev|(hbsw;{(||*!SqKzmuk0c`1UVC@Yz1x!}FHbVY$LfZ6BQ)AB9A
zbc@&0Z+Xk6h&lw(!CB52I{`Df%Ea~8p)9ZN&_+E1cH5hvxE&b^;f22BCM)*RZ+6=i
zJ`g|fFx)UZc#WAWiJwQtqK4mzDS9Q4^TFOjj<kEq<(DN|rWiMoJ0K04{+=m3zxNKz
z#K$_BF?%N(V`yy5F(>)=>4BG*4USs-P$`z)g$J}N0ld@GIV97Vmkl=9z}j$E3a)oX
zKC*trU{}=*<nUBA6T;Tb^pU>iREU4}p&6JV019n5e*r^~shaz_4p8tZ-uFnxkOe`O
z_s!NPeSXoUG{yF1)|kRtgzL6h^tV4ZXSK=byQ$}*>=3|yGh7^(f)jyhlLYZ>0`eZ=
zC)}MjD*9nu=Oa;s`=$v~6uhn&?lXgoMkP)+z!t0Kj&V38P$}*p_|lMVTa7!d0MDIQ
zFZZVdk^*FUkz=yu9(_7w^)O=+FC}{|V+=scaH<RENstl8Wy`zj{xW+Q$j<1do+M4-
zG%X4^QTw;tDvGJl;qFSgJZrZ_YsSyXT^bl8{sx(-h2w7->drV}eWpZXuZ=vtF|s8#
z`;za>JZH#XBid_Fdyqf>Gy5|@;-A(SQV1^Gu*d=yfL<-1ij9c;3)pID179loi=+Zd
zeBUHQB3!^4)^;q-Kysip*AD}d^Gxb`s_zg)&+{6k{#sX|y$&rjU%5D@j^9q^w|v+1
z9GnGR*b6dNr|2DN0ZgRW#TX-=)Fe5&ERsf!C`(p~#WQvrHCh2Ud%YBrteE=v)R#o)
z;{a29{VzT85FisHnQk;(<YzwJhN4#&;f7UqgWe+RXNcC}Y2Gr>j#FQr201EMxK&6Q
z`_Yb2S+e_m5AI$PntjLkKK2DwXOJnJ8J|UpV%KC>OZnt69V(*r`3hrTbNE75fkv9n
z%}s1?=qc5oVLqiM$c}hrvSR8@x^5Q~?b(UMF^Fu7&D@psq&oql{Ud%IY$tKnufDyQ
z07YtGqf%N^H2I}*Eg=<4<{7zL9N09q_v?`+68IrL29K+Y9$gP%G^wOORr(L!@|#-N
zt~tHOGOUkpeVNmD^!8y{@}1a$l;cb{QIvpLgzs*)Po+K@rF<lgsx}}B<?oG$nf&)B
zp&1v+o-Rqc{T4|mBK}j3aqGlGDD}7lt!nR*=_!)xD9F-E2p^N!Q>|ZpIY!#gq5Mpu
z=w-r&&SF@{L8V3AX0bK%7F*fwx74^cfxPP7+g7R9TX=Ub9YY(>ZziS;^Q1OZZ$%-(
zL`^^d33+$@749(IRs+)DQ1;v70f`VRWt|5D;z4;HcvRC7H<Gqrxx^*-%g_7fg%IMz
z55}QTR{wRYqv#*Ds0Fg>P+d6i2-z)ea%G(SxeWTnNYHN;HNLD<`QU=ng#UY+I`_{2
z`1@PlMe4`QHo5cY8+t=Ym&<PV`uz=SOe#r-QG7Ekmli**RF&~QJi2`a-{;T&^CWhV
z-$b{s{^kBhXvV5Wo##c%y454lf*ZYe^+1ZTCbPN$NrkP6ICL)hF3IWyHdu};Ognb`
zGmGvFM(?s_B?Uda%SdthqaOIhud_g3+Ft3*z=fS1osJj^f6Qt}l?!=qngI5y^3ugU
zlyHRSg@44*vYq@o^AkNpz2@(zIfB!c4&w|e-%G(CkC5{g9zG%FRs`tILe8nk&A9IO
z?xErVx>Y=)^U)6+wV!T$rk_{{=+s{5kve_EFP*+KSj^->Kda}A%>m&$K`Ff40{Lf#
z?N>j7u+nCU-h=2@5s?C{tML;(d0X)PY~ScUUw|dP<m{->O6-1Eaay17Qt#!1i^;kR
zR0lXb>h4Gl1!dRXOdmLV!ehGp$7ShrC#f0bDcKHnZ=Y5V?}EE=^x6DA^uX27{u_XU
zN0EGSCjexCm>$M$%EMP6wb&&OF%hM&2RQtz$2o-75aC)9Jlr#fBjHJDN^`=WF%q{x
zprHOrA?9&>D*kyW%gzs+sRhz_xitGv`?p^oMhSlSXFr#n=+}>d_&fBZ%Uh~_&hb!W
zfY0#Udl=UyT2;Mw`--mu?9-YNy<*Njz|CC4C`KQawS_O@$0_F_oj1z>i-?+te&T=s
z*UbR<YS&*<8UJ2H=yy6gicr9M0&)j_&$^PT?VL|-Xq?MCj%|x(A2a%INwq{C^zE3T
z;SI%8ygRkeIs8B0?zFpfq<zczc{B^48-RY_Cu{g3ar@9uVSv|Xgdk1BN*?Nif~GPh
zeJRy}sjL!_#Md1oj6+@$^tYvrKud=+vsGTU1ZtVBTY5W&vJ;l==aUFh`3_<sx?twm
z^JhcI$r0C#j!2>9(HlT+S_ss2e5e8ZJv|PB$8G`3N(41UY;`1TRjlt)zXwqPtyL#L
z*uIKkyW>M&_^lKyV3BZhNnZ(WOh`)Gi))8~qf7*eUhR<Eq9R+tFTnEV83Ut-0h@IE
z^meCNUw$e?H=4mV=dU9|J9JOrk2vC=3i>Sc?;nmB?UAQb5gZy?AoRV{2eW~==nUK1
zoxbW_ho;%S13hXAh*=>@SK3AW+QSdA5OBVGv+kTAL}Wk)L?&m{8o`hAtI_`4L^=g~
ziG{Hy<EAy9LNXcoy}Q-2+pe5wv-E7+r86(jaAIL;OFA+;L9Bao;1`yzVo{TULo?|O
z?K$11fbc8>nc%eBw8!-fR|3Q9i3RG#QWw|T+;N*8pow%u5oh2r(d9-CXst)%A1T0h
zbN1t#J^wt}9b|Hd>XyTJoj^|;0}TU9xsOJM=507R?l1ZyTz|9xS_)5_AwyK@(r2IO
zgr99vB}ertJjxut4WXdC>6af>dAnf3(FF7bF`7u!?+RmUCUoXF0}2k}Nnw0qaN3=%
znO{Pqy?s$4rXTprFV0Fw%Fwjri+ir}Cb`jbs&|>{Uh1k(=-ABp37sL=J(zhi;H)>K
zZec4L0CNv@S)fcWTCt)YBL8yYs-eF$JyxZUmgohjDE`zsAs>KQG=dZMbRutn``+%U
zsB6L68j=FYPd36iP4t&nr(TSHRX#qa{_;BQ8hFa6aW|O@AykOB$)I&T7^rv6m)9f6
z?g6pVn|u2?m&XF<vU#p8TVEE}FmC#IX6B~Dw>@wUTm<JPqx^XLn$~9JR=~|i2N`nQ
zJ+1OYFBkTl9&X8XFD*2#_An2db6l?(<=s399-|~s?ETKyhCo9B>~Rz*e9kuMX15|p
zeVR@sY~yJq&d|Y|m#aK_3|-hcS$I0ep%=lES_;aJ!G<|Fkh63=QrmW-=I#<5=_xzV
zb88fC?6^?zVz34i)1yP6<8OqpH`<PAYsQAyEx}rk)%6RPlgusBduFa`|7_-pWV7<P
zyb<J}{%lHR=d$8n!poKWwCY7_>bJjq^l~ql50ood-8yjovVAS*4F<C|vvS&W<?h^l
zY(y|tv{P_^aw-tWil&L?cr#Fe%56$TR5CCUldTu5{{^pH03nwD&f~%8wU?EFixLVG
zkdFLiy$O;iWO=jXgHPubJGfn!W^87^b+Bf@1auJyL(tynn=>{%m%8XVHeWE=qWG#l
zDB$(bt;Arl)>#L*r?Wk*gCcCiUu1eeo12u(37tisuB5wZonBtEW(@2~@g9dZ(&j5l
ztw!3Tq@Z1EvBwyxK5Q#*JIMZLvnLd`4r0}`Pa%fo$(zVp47s~V$k82?3c^Em0H8RO
zKY2>!Kae?U-U4=xKIm;NfRiNx5YP`!c`K+52dS<<$j9Es)V4?*x>l;^nxS3TtRysk
zq)@jzOBWsGdv=4;A@!T9ve*{Tm<2YYSC=_%B!~pP7~zJwzNZLxIQlJYR-+FG8-A=o
zYLt&OZCdDf@K36l^w-TcErL_RgFzWX^_zTI&o?%{nqDSq$(wzg-P%2HIi3UOw$lo?
zTH?OkR}se{P*G=Eu;HYn!VLEVVvCwPnw7UK-Bt|ocDF7q=0Y|0e{$l+_IiH?3%gOW
z>BRTn8trf>cS+=QqYE>BoFzLtWm3$6WX}@51=bTdT>GdiVBR_}?^i@{>-GV(cZ*)f
z`iipE#&So-QS5O<dXf0sWzy3QyRo)DhltsW_EXAiV&uQ0=iqFf6U*~Jjb`W++n&9P
zk_yDYBV9GJ;3;AAJ1!H6d-XB*-6WNY)Q)hdH~Qarr?CIE;?oID0MU3Gc$?BvfB^V?
zmL?Di+VdkdRb{+ciQr-H#d=m705F#eHXy;sYokjF$X|o(Cz698DIJ&zR)&EACt6mq
z@#a?2IEPxJ*8ZVeE{nAJq;JEwPfUm#xGYTMwOi_&?GCOLZmFeWY~Tq@ETpv>8vCej
zWfy#^n^>{t4%4uaEMiAQ5!QkOl5Ny+PtVjp`{kcK^na1?3JaqwM05lps+AgN(HVcf
z_8y?B$$F`d9AB)E$P3aq2ZIOMB`hELpb`;5%D$(}=Sg8L*wZgUn!S1>1tkknHick~
zXfFIrqPVArVE6{~e?*8ZW?0gz$x^@WzPjEYaE{F!LSn!K_9$BKWuUEY^kk&}FnEoo
zn!_%M!(qL*btzb(M%#a!2u?9{b+RlVvXKhIgv+LuR!1MXEOqit77w5HY`t{3;BHsm
z+-`#5N<Kq@OC3j$)Qb~FT6Oh`gWs5FaX&;slo-?yv8hXN4rF-<ZH7S+kz;$y5{j%7
zM58?wsI91ufe&$BdwRL`7#kovWCj8F7I{bpg2)tPjz#cp$rl33FGtD9lQk;WPK~q6
zcU<MngE9X1q<E@bSl6%6ytfUdxFew+iU~(KrrNMS^(R4llGVoYF=3t#r>3|FxDUfb
zN%b>`O6b|ph0S~ROM3RfSP6etKz9!OT0!w~2pmWRC1n+Z`wtbfo}A~?$9j?b0w8(^
z#>dis&8r-gD&rYqlZ%jnB+P)j+Sto-{0&@BVHTOrs7G6^;4RU(-88t(UpD3LKYvtU
zh1hhK!XvD{d(JJR-NB_Qy{CMzf3`M_E91(tXL`~)*Pu+KV&JQu`oep~A@bcK{a^Ig
z9IyT&fEejhs64k#QfAsG`?=g|sG)NEH5`$c^-mkPt$&7g?h=BR_#j}ZMjk(Jk)0NR
zptYt$^MVvKDA@>6LK~nOdoef@+6W}v6kpS6mHsmuf_WPtNwB<ub*@G941~<xigbWG
zM)o{>$_}Cgi-L)KzcxW=m5;Ir+^ZSYPVi$5Q`pBqds!b-6}rc&wbu<2LI(GrC5Z;&
zFDjiU7<QHx_PzJLp`9S&GW+AQXAYT6aQ&J5^`CPED_=DRjb9(oy0bg*^iUG_Dd|W!
zw##yN&|jklVnv?WvxB70rMdN*0&ds-&7q}l%j>nK#+rlauSy(E#lX4TYR)w5#?-{d
z4m|VmxXLlAodlK@TkmUs-n8F43^NP>e{||{2+TJQrRE`kQU{8FS*$Up5xV#hE$upQ
zlIa4tD`oX8YRRy#e+|t8H`|XgpEfn~j1;wttZzqlzYeWUT9O5f*~U(izmP?3M#xUG
z?JE~%30e09qps1a6Ex2dWIEXwuXWg0N~W?mW!?_1(tajqqPbLGF`|)4^+B^n)#}m=
zcKQs_$3UY<%`kn?!#g6&CKAW`PT)9DDQl;c>>o2nWl+Fm<Quj<B;bDmHgR$uZ$KSR
zqmiDjTmZ>XD3x{qL!xm7Pa}~NLOoElcQB-Y)`}it5HUf4hIhZeQLrdwSI=<n;Y(#E
zP7SPS5nvTmJv?-pL4_pK92eQ)*-$*0@U^cV84l3te|}_?%tZ4ok07oXdoS1W_qz{i
zjkh06^tZe9P5<JuSG}Y4R>FNog_1zdk<bcj3J%>DB&?{b-zl3PTkNe#)Ht<N2k72}
zn?Z49m-#E_7h0F%s*e9M0`a23d1B_%2{edYLQNY}<6&)Bx&G3JReT?^3*ck>;pFN@
z&IDxIeE>~DFTivIdTBS6sV;cRvk=A+h6#4IQXB^n0#&+nc{%npy*LNEXY<L&{O%2Z
z`v~*V*~fYi+TfS8pP86(Z<4P6R}J6Xicz7vnap*KCStc>i&M_9dXqI*RqP=hL*MIa
zwVGiEp|Up}VYg|lxNu4szjtYUn%#ACWt6R(cP9AsL?8HRxvVxN9oJ<JY5QL7E_&Gt
z9Q|N&9#1>xX{}Kid;AMMpIVFnFT<Z13iTTj*U=?U|M)C|F1Z#qS?Ol>%N%w$@vR@1
zfV{rs1SQD~Sk2~<dtl;wSpGFs`PI9L<B_vg@Z4&xY*e&Ku-TlZdqf@y`TYH0ewy=0
z(x2_fncAlY))B~bxD7O6DjW|{!6<jBfs>em?HSn1-n;3!X&;=>GhcjP+kABa2PW+d
z^->k}TwKdz2UwK+r*kSPqtcEX8BP>;zpH<vymskZ@3c=OtNZ31nFjDd<3aMN`(DTs
zveMv`>8%U<qXfl*0W*^y%g}_06TNc3uhYvOO}hti%zMz^QKO!3$IX+G$q#C6(J%sp
z&li)FvgkC^b*@gpe#@<f#av;X%mB9~h%KCIiOGas^;J)){q%@Y(O9_LK2?r9O$Cb*
zGLnCwVYX(%JXVxs(pm}H@p!+2T2A76xn-pL#*8nu=vQsEg(B6=)}M)Zk<r}*nRk|Y
z3jth-OW$xp6X98wO7i>>oK$q2IWQBHOhPm#PGj7+CK0Y+J%-5iPdVK|9*TA)UgN#P
zzqcp?d^magVGlc0DVRQgA8?Z7Hu^-6w~uP2Qb~qTBW{Rf<G{l;q81pj&%62$oe~bu
zm7bT!*Qcr*fJU7N!qaMd39SDMp%cZ13)-9Au)X+e5HyVlhU5M2=Iv2oOfEk>%@)RT
z+xKMTOR@wzcuvw13|oSpK-UW+-*t6M(<heE$0BMM;Djusza3bN#dd(3a;hj#m=W4(
z>e~j?+b^RHZNGiVp%Umad;km#<zr*+l%c~qqrihZ-!~woj&Gv;*S-XYE*z>O%O61C
z|5WR$@9)DkV8{IwDcb?l9!({y6j?10iZflQ?lDEIh+8wKVN=QnJ4hQO)Q{@%!5MKV
zs*&brebv$i=-p=l>JOUMWs#4x&oi`bfa4|1c*=hz0o%|BX;Fs>CpMv<f3RjT^q|vx
zc(|fe9cLOtTi?}k%<QYtz-(m4n@?xc?X|3yU9<f-_b$y4Tjn^v8F1UGfw<Y~t{P**
zpIy3z2Y@mtYe!X@HIJNvia+I?dwlzBn~Fe@PCmaav^p692g009G3S<dAjr^irj3Q5
zz*2Oi5p)g1x7GMLQBH!B=j#S((ql?P4ILX__l3HO%y>px9JAZ6>Sz-!6+<?6P#_in
z2;XTz-E?Mo<_%wG;;9P;KGRX=NDGUI&jWS(_!ajig|*dU&l>(F&DM+v<)!89^jY!I
z<0ULC>;k#(1wFoaoKCHUHk1yXUqf>c&caB}DSz+!E<k^wC!~-HZ(eFL67ve`nl&FT
ze*y|luSQGUWY=k2S%kopuqZph5UjCF06dgBI<E$ew<L-Woh1UZpxaSFNc#~$x_%&J
z`+j&|;tT2>;&ncsLzTxIdj2}Pa=+6`h6F=s<L4xmvCO<!+ms}xyg^7O&iW!nmo+j9
ztaJfL3h?=(0c!Zxq4MWnkd)Qn2Is5#II`WFPN}C%50R+Tb&84FNlxk4_fjdot>}zt
zE>8=pgajJmLZ)89-PR+^vJWi3^%FXq_bnG~On%<8v|J!y<g%c-xY?k)TLEU0=I0X&
zCq<k;mW_fKG#|7(kKFY|=<(O^LiSV7xLk|#HcHk-C);eetn$8C%stV>tH{5BGbQ?<
z0%n5r^;o5`$mpGHPf)E_%5AT<{|oSLW)_c;i9`nn>vEzvC|sH<SQ$;9vT)o?_DdJ0
z=%Ac|T`&(s`^63bs9!i>3F1g=y0@l^_8_k52bmu)YCu#3{H>ZlKo`ZAroh-V24ba5
zfT}D*+(8zb0gl?Ur+;SSK`{H|X@j>f>zQ{W5K>bBp+U%RZ3r3mMw)mr!Z-}qTLqN;
zWrLJQgxZuU7WX!U3A!2j_W7n-L}$ETsmffKcCQNHH2u&8*IMPK>e~gW_~U(}<KZ>z
z#F@>Fy=$Mt)+`nUvxPJkCwR;OJ<9X0X}HEooGMNJ7H-~=qH-(`XUp!$Q?{2-Bj3xa
z{!f+Oxi9)G(V7xf1_U<ByU~=vKoQmx(hEG>r#)T`_R|B9lfr3cKb@_v6QgM0E4>p3
zE_0+(rJDW$&G5AJ!59xocX<8JF8^>(A%SZuADm+~U{p26Fm*`Q>Z|QzF`XGI>xaX(
ztiXPl-cN+V`PCOX(sEz1-D{N4exOsGMHV_}9R=Xr?95(J;}TpZj-h~KS0#%lz_fmG
z{vrw>41W`^if;qBB?n3{EZ0xcRSa>w;%(w?eQUKr?g=m5DM5YT26&u?=;RzQ1-UPQ
zShe&gI)5x4!1H-g{<|eli;!}J!ihsYBZt?p-sf=#8*#m0m1)hW{-veG%APG_XwlWA
zeH_g-O#%}|8Zx<5atQ)+D>zMp3|A#2pd=b@Sx0D63mvIRQf8EoP~XMnN!X(BT65aE
z?doV*Fj$dE67S+cC`sA58$)i6m4-L8i|=U^iFdO*)jrfO;mM%KXtE@IDSaV=4jZ_g
zc>4-cR2huE|1B{3)6UCcp_He_3ax|9<;-CgC~1Fr`QozV>os{kKcG0^PlvOw%lJID
zI=yiVI0^>Aqx~AG81q4tjS3PND!Eh`W0oNHO6c}WQ1M$@@B<5etD{_jer$7s=2{13
zqcMU}<-w(%;{bqYAw&oakS?lnoc9bfc|o~{f)mr|#+A&UZqOKgA!~Ja;N$y7xp1;T
z2Ul0)p$$qyF@j;u3Bf)d{j#aZgn<>>%d0+AOs<~+oB8(qOMz3jdSSWC*0+qigV!%6
z4{kndkv|%KsSv(MRo8%hyP_kIXV;d7E@i;bDou1Jv|%%i`U&c<mR626FhRzHu1Q<6
zKZI9WIYPKpU--GL0+T8XN(H*0(@VR(tk<D*ARONVeN^IplL_S#sw((a1xoA#S$lMr
zNU8NtLEXwUMrJ$hm$$~mR;QQ*t!gw{N3>sG--E3mfb%#l-!4G}z)VKoW~PWA3>B+@
z;*`jL2S?S}hTX+t-8aSs!Ai42-DstIo?`-W=-_KVc0LaEnQ$V@%t*JgJ*O}k+9jUO
ztE4EO6>NVE(xoecA+@$!u6r$pzNZyzC7y~bl%O`d;9NG&q0B!LmE~3%Y_V!f)Yjh^
zxb3Ji`BW-^Q>)Ollid}LV^T;8v3eieSG$sy!RX;~k?txJag4*+hgP&C@{~$J0kqch
zj?%?6Gbyo?nh9=?T8;)t<6D=dI<+t=C=RWhPjdt)9OfXd%l`rpk3pBod`JB|1?oQ`
zUz5nZ!<5o|TL07wM_g>pr#t|JO41@hAfXK>=4C+prE)FVGz=LL{OQx^guqm+BhRf{
z@Sz<nny1;i0Q>31CWdoqcY#&+n_E{#jI0G{{F*he*&v7cKstCCOc8nc#{=!fyEf!+
zLH`eC3ay^hN_p={?jO5iwmz*n&j;eamwnSU<(s7OV!jB6hLPrNnP$YGHs>6Y&N&lT
z%h>uA<-%RQ%Mj+n%9#r<%Kwobv99nZJ<fg)%<Zkr%SV@QM!bb1z|7&nI?g>ho{<2)
znTo0Ju*Yb^@pZQQU6cYjF=c8p1(t)t0Nur=^RnMemJO=;+DU)*Pa4sm#`Y6>FZG&4
zY~a==0M<NG9A0|=Mf~E3;&tg$JY;|w6Cp{l=%S5j>#G>y7_LGYy2N1ELUIA3h_!lu
z<IPc@J~lS!z9R1`wZgsD6|<lJ_B1|>CimXTDfr3FV!7KMnlp5rf%JJwjbL$MgU9F0
zI}22Yf@_vc=DUR`UB{Txo)+*)qeI}1Wv00)jlZge^Lw(RCvs|$7BH42EG$x$F;MHv
zxkDHNbIxOMSP47L#f3e;x2pSpVgyt}ZouMWB=*0A9cN5X*9^^KKeU9mp1|>n*5?GJ
z>6#9PrtkuiciH*-GXyE3!^qeq#*)XwA=FQ1phn+DACe@5j=h>3_Wl3kE$OT#7?7c_
zWi|!Sev4JXNpK(V9f(?{=d;zquK=8?$W;lr{dkTJ3~>-$6)d!q@io_L=}6d2-(#~N
zYOjsE=#zKc!S=;(;2d11-mwiJx&Hyyyff!#9~89|1vggEF1U82=?$Ay2Y`ztu!|en
zu)LO`qYwj~<L@37x!lO3;<9edU#UGukoiXSUi?_|)si*nq34dVif!LVPeOy~OM~Y1
zzwN103b(&_uK>Jc?WqlYHd0l;q3})#dnf^`jZ%ltKuK(6?0(V>CGR>p0t?&di0&g8
z2|r8!S!f8>`Jc9unyNZB`+dyRUhaDv9XG?+vKVM<ooWN<gT6*m^qOz&%o86y>ALym
zQHC_Zr>No8f9Q{s!T$nuWndX4n~Yfh26Xcgpv&_9-hZw{;s^R@{GMie!zgQrHljFD
zw?}9r+Ko1%Te(O6Q-lvQ!N;TT=8tZlH*NxbC?CLjun>`v(uiHvGRQ#x8GS_Z*?UGJ
z+i?vqH8hjd_f|QfZ=r%FMKU`1ze(KxcM1QQssC?!3PMu1plMJC#0=#~W=eRG2`%0n
z3jMFYdRu2MdmR>+Tnt^DH(Fet@Zpc2V&uxFpv!YP6<eGAaI)Cx;MpFgC|J2;wq(wt
z+gF<{%1k@BYw|a5Z;KQbmW0VNw16dPiU3p3gbc4Gh~oBQKY(NnS1=Siw>J>mm)rlx
z*<Z&6nLTg7uplTQohn^|w4$Vdgmfw?9ipg6NQ#tzNQZ(-mk82=(jZDJAdQ5ifHX)*
zK6B9Bb@%go-_Q4X{#qCDzR!KmoQZ3$nMpeX(tM?ru1dNl7Rza2@CFnr7DL6X9&`Ze
zAgu+bKO1<cPQa-e00v!DG`|}+y9{NXWt__&jDrf$19h{fyf)!yZ6M3b%2~MNm^Y+N
z0MBs(PgEZY5z*Mo8)Pw_3<xhk!y%!`z>sG_7Z5ulw}e?R|I{lo$n80ypMB7_pcd|@
zrjTn1Ax0P}RIfb<q2{keZuJGXAM#3g$hm%HA`Dj7P!>OW3;nLdLl~>{&*QwW!yd>;
z2Nk3J(L_6Uz?T@&z(g4<sE!4w^Pr4gQ3+3C_eWH0ohBLy2dCJ1=29Lu=<niSGQ<}6
z{Y<Z3WRrHWh@lbh$Z<Hm1ch_nQVh3oP)!GlNw30JcRI1PKxA|q7->$ZC<5rF@#jld
z?HT?TYllVP{g`bjaet@52)KgfBR=?OUN{G>65mucq>NC;X%5=w76H+Ki`)E%zB)X#
zMQHgCCII4E1of&NT1EyNQ5K$EA!Ent@UMZ2&Ra10BsWmDn``DeJ{3<9<Qkwl7KW%m
zT291&y7Vb92V(3okq7?!i<g8DO@aPUE$FifIGH-pQZ`MTG<<IeG``+7&2i;=-Vzyl
z9!7-TgB_UfP~t6yu{oTy(mxE8VRw{-G5$2}0GkCh5FHwYpADmx#T$@uV0HK%Os+U<
z)mpSLRDKo0q7K&V_hXO@Bf14;<vmagfd(e)*H0HGLCs;b-Pj*VfF%)FUtZ#&F<74%
zTHo1BHg%jd0&i3@AP%YT+0KlxZs?0}du)Yh0%p$^q8tf?mDeCUtlUCfNl6Zybw;(~
z4+~&~_(p4Fk~`poSb_YU?s8K5iwoew){TYdUBt)-g95f8fr7epqu$`@Dh7Mr6+0^}
z=RNn_XNSt)!aVy^|1Zx6s{E``l!i*JA)1UiZ};=)7+EA@fPg=x@V_k906GjnX^|xw
zz;0y*J0ZzXjEe?Cd+aRBp_%~Hx;m3qnRt=@-W>ncD)&+}V;yy#L6`d_^7RoHg;vz5
z2mt23o1=dUKkP6N#_Jq7PMLtEeuV`Nvn6N>)QqI&6;1%yS9yL+2xwmeeQy8_OoRwl
znSYVl!5Pi^Mw6qUmK)I`A})N$N@E+jiO~y_=Ac==tmby0!yg9#-ugP+GmAt<QRE!R
z<6=Sudim|X{C8a*Ku;6n7(J^B0zb@Viwdxz!unRNFYXQeT6`(ZFKoQ>CTL70B9ftw
z4al?UtvzM*k4^wz2q<VlQ<|zZaKJja(-~>dOzm1(JEvh;kq$pGbmxS2DK;=pK<C{V
z7{ErpA|j&WNU{%?<qI@NB^`Q@t*Vb%^*s&E?}M&?kv(cOsP^qI9RYC}QeomW{n#k(
z=pbvy9tKw{8`|(g_IxdEgX$=#$~$C2P>PL;HGem&xbvm9y}AjKOmAU?(u>={M)<d;
z+mivaJ&&4lLxp+c|NqlKS)C)xMo<rnrUB%cG1$Q2%i`<+zI@(xMDo>@%*f0kQ;{#g
ziaAinye#SF3sQ9xWUUDvTQ~Cj{&fY2mmrHosrDj136{!Biso;ksYz%YkLXmKLq&ks
zQ^rHy<V;D5hqSP|vb0)kAfHXXB4)31UrH0Gx$|Jk0(LHHvvGSvrAj}_Dt&l&*Tx~H
z2)Bm+&!d>Y1QTT7)yj!1%EXa_LC2`)Kbnqf2o26lM==*s=7gC4VB!<%o`~8|4FPwF
zL?T8J|6mFwG?dJN6v=`aozi_wFrei~s6hi;fi6+p^$Pe4qW`7Dagc|`#YA}^T11#g
z4?<6xj5R+?H;X`aI7SjM3I}~PQ4=^Q%2d&pTJ{Nl9Q_ENqzg!fL+<g2-Q@OTdbD`y
z5fBjnG4Bb0vyL?tkqLlxki|rI89+L{nU3@!&x5aSQZ|AASqD%I4N0AYE>mjPbj;U3
zonko$Ezn@1IO(dm*i$hq1gE1SW5?<Cce}NJNEQH?KakPf6K0b_+jtxcsWND*<K#ft
z9L<e?z?D%@8bR41sDVT?psgUTH|^MZM%#eW$gx6pOt)!XALegLxDS}tEnw-QA-#GK
z-9&yA9sH9z3X~{R+DLMxpA5-yBx(C(8w5l15N;I#Cxaz3rUVgJYhbvG_-n||e_@R=
zFoTw7PWnv0w~O?#|L_J71fWXU2I|&*W28>~3yO1rB8s4lPPHYthrKvly7@J9t`j6M
zuR^?XtTNht%uh0qWD~Np^OYOJulKjB_g}-<LGHSWu#ZE+4&VBqQA2c(l%@qshj}*y
zmh(U(aGWaE$upWEA=f~s`_I9cf;#b){czxD%t((wZAm^1d22#7wMQUt5i5A#hnW=a
zOAby2pfKwYcU=XOm;)0pZ&1YYKK(r^;3f)o5#bFKyQVs-z~MG1X~*T7IepBHBk{wZ
zL_cqNc;jE}$`0Ry8FDb80y^FpBb1<%<#a8qt|O!dwU0oV6RMt5akf%rep6$AZC)4D
zwp&8O8CwS@pox4-Z-4YJBq#xbh_RU;WROn-S=-8-L6bv&8}oQ765_L<#xSuEl>$wy
z2Wa_fcb3-Wn(=AC$?Krqfyzxr4!6^pKR<w}F%)e@s50=85}sciIrpb7V<0Aj8~QYV
znHYZR-NgzsBK6CEUpS{0?zd?+3@*TgPCMs~05zBfFM8Qg60~B+8STXIyTC`+15RrP
zI8#{Z^d{=tf}e=Tj#5`dI*$FB53M<9KZA=7OiW%<;&>H6$VGd&c0@0pgfoU~V?hfQ
zb3phjQFjmRtSgyCfuE>?seIFllnq`a5z>iGe~Uv^N(&V)ZcaX-vR4TH<LHS4{J`zw
z>T*HUKgmF{b{a1bR(-HjKoqgzpd(lSZ)8SZGNzmK0qYwmK@_5uE<yu_U`IOi<rP~#
zSmHrRrydLx6S&!thD(uRBrnAOhrcDs06Rn(iiLKKh;$QCrW(T;5vd6cl|otN2jn==
zOHKY7rZw3*>{1^X`sW{QuC)mA;wbcEKE@;hbnxqW>n`>Im|C&DZw+FmS##6i&m97z
zf~qnT1p4vOXW)@M6?S-mW@Dn(JV1R`Vr;Vm@mRvd_{7l^E0|YaqpGtCN-MW?wQdbH
z=J_$xnFBSUJpc*aC5qYnw@x_(uCoy3!S*YUz!|;<4%Zh*f!F{pw!()i1<qCEqZ**l
zAWouo+1(o_&)?ELQ%b30EI^NlTQ|5_>L_$0Z?8lBZ~5RnfMM~SAhT@G9+se@*?+F$
z1nL?em-JB-y`P1{8!)0To6F<#FH=L$7_L8I*I-h(P{$2QMG4Dkad>`DMFcR9dBCld
z2ijpS!jpAm;oCk6e4RtnuaS!aFwN7}WKxt!|L0A_P{NPjaL^~E0apwf-JqTl`cR)W
zC`)n#evRQR?n^1jRc`C9m!^mrXHy;(zHv`hUJ<8)rSAu{-7f$eTt=^|u5}$c2u0R#
z8vLV5&@S0OKw+2qR6P2h$bcH4zRjcIy`Ub;6KV1w5kl(vkQ1~T6)t$(O#<Y5X;G?$
zFbwqtD<+)zjb|nMN7lf5{DN4-a2KKH^1xlfCw+f@!y#`zbSg{-qHF?nW<>z_bLMzj
z#|ePIW=fkt+5UN_IbAgRdkz}<s?av1E_HzmL@uOhJp&F0s)Q^8)<D_mXlSpI{&-&q
zV4y$F;sYQ;O75<O2ONaQEcrKkc}}8qp4n@-eQ1_(ngxvzP)*a0q5<s(<0XMebLmTK
z*^*EAkNy5~qNSICl>{Sq+!k4q$*gg-!g&}dvtRZOSDK}=2YQ9PV35U@xk8@r{j#r)
zP+J5ipyS>mYnqN`)uKdcIhU#6!0UUhRjXoRkK>=6U5$OPvAu42?Z@)@1koT!shEIQ
zghN132Mo4{P~!L9?s*&;HoGJSp1Kz%dD0<lc9VyPq$x%7b)UfhdL>{(^qT1ty|d$P
z0CT&=_GH?EK9y?Jv|vNIO%RN0lI(Ss{V5ebF`4&l=zU8|%X{eA7ZDMWynq)Oug!Jv
zz<-`iUARPxULlE*>J`V!yYFVu422P&p-Iwr^!Ard7(ees47%FAClGv@BpX&o<-<~$
zGXUK#y&+OdhKP9@{Quz>4L|={>#s)>;_+G75r=+4UoovU0;JmpkQWjfnqbgJ-yNUF
zQDL)>)L{av6dze?G%p?!Z4ar!x|WmC+Dk$=iXZRaHmGpn&#m5bTQe&B_w&j5aQG}n
z%YVIW3KwRQ5kR7k3JS3NK$<lJV5209@!TPurwe0^%6=H-$(@1cUdiDNIZ4kp8tBUf
zb`a(EhBjH8RQ}m%c;IynFUBBBoe=XEur2BYnD)twU{F54(%~HhLU;F)jJOYax?c^L
ze}Bk*o4|yQ>(G9TN6(#6zT<su)3S-Z^3&>S32<!iB_t#+C<i(G+Tp*>(}2Auyw30G
zTp&8mVXO(`2r4S7SDp$y)z^~IJ84mOzIWi$@xv5G0h_@R5jws3wk1D?%r6>-JCg5h
zm}l03Lnq_zUJ;a7v+$>+NMc-ojrsikv^^m@=t1@+h@VGB-aU|4Rjr5rkkHc3*Jl*m
zL`$iGTw6Y9?ql@gG*H+7nY9U!{eo+sP8c3l&^Ic9Fytg?5zWigf;Kcjx=Wj!v7P&u
z^mv`brY<ZYi7~tklF}Yj1I1VmQEpmg3KEFY`ufj}Uevq7i;UR$V4x0E%Hbn=fQ>WJ
zCY-QeUUWC3rInRAguOyg{GF1Lf;y2hpFiJYEvo(R4RhClBU#M7V5JHtRm0;AZCOmg
zN@ab5da(e&{s(i{vz5_bX?w#2^DN%dQ!3AWmzHd=#@@(E;KfpSt{=X?|Md+}YRn<H
zZft7O5Y@3bijV)VOD2hN6Z4o}Htw>+!8!5a?SO)J0oF>_qNwBdv|36!`g$r~Omf@e
z8jzcagoNCMckQ#X<In@U28JE+yRKOH`uUYU58v(Q$ah&bbB9Dqmey2jJhY%u>+3)K
z3hmNA-&J)Qgg{3pVHJtlNK#GJ{p}^_oMzURaN&C{R{v)v?_cZF!4-oJSL~de@1fhB
zL3euZ+Bp8j$lm1G<LymRr@(phG=U+i-#a0JKLUM}I%?a6Fa0HG2a*Big}TtCNy?TQ
zy}gJf{Mj?m9$+3bO@Uu>0%ir_YF@rvEI<$n>SamO8z8GpLwgv<x6wf&BEe;h6B(yt
zZ%kBIS9jXrCtT!+*w|NeUEtgOSQr|(Ph{El$47xY^MFiD+v^E)pha{u=R+SZaCQ$h
zgwg8|L84fcr}Bp;%c)a3ZHG3=@zf=qF=b_Csn1y8{Jy&1Z9M#O6~f8M$ll+R5X-NH
z&_NR;h^ChfP3I_q7E0|9@DT<3hov1Tt-rpsh^Ho8c9yeo8Ex`XzLx><Eu5<>tE=6t
z-G#4YYopuS)rrDF!!lq4*ce17F9lO4^Uy)7yO%xwcRF40@$tI{t$TBGzBOxJzywc8
zOH>EwFWGd^>syW@w+Jd%-Hy~`{96T;k()pV56j6kw98kjYoGyIcWyK!sI@K_>`}PE
z-gZ?VvjO_X8c~26tX6=-cCL@V0(!1M5jZv?ovO6H`;v5@QNHk+x?!TyX0^kVDp*il
z2&L4+o-p&U8k9n(yH_`<-3k9Cs7dA+{#+tP2C5-Qf1JVsHdwOL8=zKqY;Jz|AVcv6
zGPzgo+nsji&POZeJCzSZt&|hQ^zLKq!PKg0QU$3QyYc!=T5@Pv-8D&bK{yi{OPWIr
z5Waum%bVLL6%`e`Y$rS)uo#q1rv!-!i;2hnDY@kB&%hkTbF*hiqdRvMt0&JmZ*MZ6
zrlk<<frGympo(ef>6taGJUptqI5;><(a8=pu+*d2&=AuHMsU6oO@Iy;szjk5?uQM(
zwjzRX-Up~tFI#%_XsE)KX3w5EV++L~FjK(L@3ehzdU98`-eo_%m1;pCU^wnvyru9j
zjjdke0}HA;aUh5zP5|W+))1G;FKHSXc?_SmbPi&)<=+)PeY$ymeEcw+)*?yx&{2ed
zeraN&ikvNwUlIc#%kfq>bkKj%z)^Nr6&o8noVkZZIdkW=X-weE;memVe+FEQB3l4L
z*K~7JtAEM-A?(xmV2wmC9;~YSw7P*oHf9Xeu~3m#Sy)-U@^s{)y7>CH^xUGrId<$y
zgPJ0BVq9D>uo_c=EFTaOQe2GRllm{2W*;en&&t5<$2d6t5-G`AD%`8bl7mKj?Qigd
z@){AvER3=QE;=N|@Vg9E-MH)*KY}OU#jg$4t#_Q<P$VNfoR}hw8k6FEo2jWO&wWDC
z|Gpyt-XYi3=l{<M*KlJRyDwuS(*7}OKFPNYeuy=^34y#V`0lTQ`t<&7ao`r0gXSzM
zccKygTTt8MYoI|By8Y1m@5)e1N`o0rBs7Ky0N8WZF7Q7NjwD7RiAm$wS*1^pH2>C;
zi&W50^++_Ao)4Jxte}=PU1VU#%j{JDv-=-$VfXL-+WlWLa1Ij?h119%0@#~@m9_3>
z{baK@3U_tO?s7o>-mj=gK*#A*)qlSv={cycD`$o1<^GW{6Fn0^xBDtw&%)^4z=;V9
zL(StpX<%qjeNR~&^C{VXeiu0;H9DYhH=7k568>JN<Y7$aOTnm%DL<6D$jHjxF!``f
zWC#7q`U)-aAv1oCg_X7cE5_A-s~EU5y<pClpQn#V{;|>o_Z0;i2@e`@re!51y?8DH
z!8a73Cu9=?f`j2o|I)sHzupV8iV0a8?dPlisLd0^(_gD4@bK~Z0njE0?kQkA;gbfv
z*Kqds_Cg(Nwg33nzc!Tu4!OeYxDd`?N*9(yQUc8oyB0zruuyu-0I`|0ogF{ab{rmR
zjUfB4sByvr+%40E)&73K$RsAI7^?VsX;^+9!uPq!R7ZCgsCnW|NloR43g8A&Q>DLu
z`<J&zzJv$*%S4c6b%mL#dIO$pes7jVVKAr$ChmDkW90YYLBu%t~o(iHx$V~Rn?
z)M4$A`}ddD1Yw8LT?mBOF9qzQ3v_X-a7ha#;y!zp40MVNaBBD<2x$;C`=8cS1ES6k
zf$rUf!Q%)K;<*T2{Cxl<L*+3p&=SW{SWXrBfPdV2FJ6%4<@D}v1{}Y?Hpz<$A6W%(
zGbDj{wginEU6$o(=@Asn2hGUu&U9d-uCU{@1+4$<$FE1a_`&7Q(>+3nJl5ZDF@er0
zF3?QF7J{A;NaufMZG=tZhcbCI6T$@uDtG_aWkJyoHDoeA0OIL`5T+ox@0;zAPU1!G
zIFxvDqq?>KdA9&!3Wi>kW6;yJ4zfr&O5sq=M+Q~Agb1D<bQur%pAQlGEpRr50l#k)
z=q2l85hCJ6(8d!NhG~(2izRO0*sT@n{2}`9J^8J2M*c4e$E9cSr51DH13^kt3WsVt
zQZVu7pf7qDgt*`BL*vN*v+KsBpvjb8eJcG&lkrkMJ$I85)pmiShz4sKiD@=GWNS!n
zD!2~qk1j%Dfo3s)`|l(FeN;7^$aAYZV@v$UO7kk$Pe3VA2w3oBfb7b!o(Vg!QSurE
zLzki?h+Y5ZO#%u@0I=oEAoxV12@XOpm3R?~n?2d@0H*kyZz}yipB;2Cf%yzJNWlg?
zdITZ`og4C@tG0Y!WL#W!i?F@sA76m}=&k@BJ#WtWDaF5aXN9f_WIcuKCvc#NXUNEi
z2`^pdxb#V3dlsM{J^)mbK}6)sa%f-VKeGJm+$yBtSvp*YkN#l+_#g`~@$K6fBx1U_
z{S90^!Sy=VP*Wp0)Ef1_=_SbdNk~acck>WcM#Ds#q@*Ot`5sdgpZ#?)5B@k0mjCnK
z3Hu{|f1u<lW_o&h2`DiQC@ragwvEpd!_skgHmdh!Wn{3>WcdF}IVvILc%xWY=Wlg4
zl8z}XEIi$tM-TI$0|2(aH^Ux(vImA2Ffb|sSjA`AOg<%Ieh|z3{dU-r|LDg6FLw;r
ztsghmm@ix~+~xADmx7Son1Pp<2pB(-W@b!o>vNecxKtDrP&MTtX<$HuTy(?M&pis@
z1~H0@8|})=%d1PYBo_R~I<-Fs^^%iA-Jk@AD0vEVC9elEC%#CipmmYFnnXxk+^zVJ
zp`E{U&pm8V>RNpr&=cGM+R(@=DP5L(_1|YMT?6GlTp>bp5^Z!mF1X&<v4VH}cPhZZ
zy?(ulFR3TV(u~Q>=|LDE^K;*Wp!!fehph-$N?~uWvCBB047?-T-Pi+dGL%^M&pyLx
z(e;T8*Jlr$dD$ZTMJ#iLbTj5#Z8Zy(N4oHLtNs1mTVLcLiw$Wxd;#bsm6@4&-nlLC
z&)tNzXDN9xDmf~w3Li+Wt_fhvn=sv}0Q~h-DTV8_<Z?xlrguf-9$o+bUL;Ngo1FoZ
zv45Xwz2@%3DFgpFedlRXRv~B3vGvM8pOS^f+*M=PU5Y5Y)}i3=IF<0k!$F}Z%As)Y
zIE2Cvzz;4N<DYx*Zz=aeL0pM%MEt=KcnM?l=d1uYZwfHk_wlhtZT3)QSq&Nd$G+XC
zPN0oTmAw8<@dJ$;y{p3g@YuT#cr2v~dl}bj1{&3-L*n$`&e(_MCR{NYy(lllDj{Vr
zn7MH8$8>USG^9C8x9sM^{`2-B5Y9O^^TP+Sna^gsvuRiC05GG6hE|)e;6;zxboL)>
z<96BaAb8%)mvq^p4-gR{5<YiF)+2=TE(b|;0nO)!R?LDkZWFAA87;zP4w}i&UJ*y^
zabn^VGO!37>s>3=_Tzr{?%jer*EWiX|F~r8=^$oR3<rlYqKWW1(}lsXF4lGXMK51Y
z?Bz;>tz%A?K&aIi#wNl7#Kh_Q9T%i88W8QO5W`aL1c+=Mx3?Y&-SKS!i|ee`oL7N0
zn&BbqvX!^Q+jLP;5f9BfEQ0wELt)qU|8w>q0mvE+`lVTT$aM2fFrb&Xbd3aY>7)By
zLwJ*W$vm@(v>INRTND9e>#R(w+%npg9_+!Nr{YhNoT`$Wc(fR9IORe9!7%M>UAF%-
zZ1P0O&CIXF9$RXgKYxyb0e%O@&X7Ru8}pyv44)szSrhJe=a-LDA~OOtH9kA+1ge@;
zB2;n$4hq$CfJH13cuzAskT|*Fp3yv#DpZ)xbBfk+&Y>8e`Sso*a@n`NlQggHX^zf#
zGJT&u^}PTxvbB)a)f|!@S^f9h=&8VHjq@JLP(uzZ1txH;Nzdb%S3x5ZhC>Q67q8sv
z^BN1dV$pq%+R~dpcRKn$2%<O(v4bt8)a#y+yQx#QY!wkaR&@7W2y4$oGcsG^1TuGm
zC!wZ%n#GDNg~;>Je&j8Pp0I!>y!L@_Q~7T-hfcai4&3g|=~{;`X!{LVmw*qEl9Y7F
zufQ-|YvZDhV~Yo@m!2$ONRWWBG&X$rZgyFlLKy3}kz>Bsc0)Qt<VKWr1L-oRKMS+Y
zxsiDL9VI3u^#LrjCe?iO{lD$%R}^W8-u(0o*=O6*8c0wC1_vLTU^NOEoQO3TmJkIW
z*<FsoqfH5?`D;j2W|TT5`Ky*k4fC;qBZJ&R-FImer6WO-)YMLYpBD4Y*!PBj!W4Y@
z1UkY*w12<d3HY#0FA!aNT?^F+xKLiAD7W)qe&B5?dlto+lVYQxa{1s3*F3}`brKJ&
zNnsZHJf`w^`{A_xx8?Fv3@=T1RnI5-6xLh^b#ao7;=apy*xGSuVpapms|5n-G3tLG
zA}z8-%VYM|XcGrn`ONE&ap@Jx7}|Sg^}9G<@ZW_m7H3g6-|oa=Z>n96KI<qt>l>d+
ztb6y^Dg6pR`+bI?*3Oe`l+{*DJ}VnZp8I>#(4brJ*dudz-XE{J3(%Ep27b#85PNdl
zm>MXG4*|zW^Zv!B!xPvPMF+5htv17r7KX>$*drESZnip27SEHtC3s+bgHE-`ec)!p
z%NXyUF&3%yp(IP3oF5>o8U#6~X-n(4qkq1MeZ&$49k(tD(V%S|=E{dM9_2)FLjwL+
z3y0Z*=f8h?KiLOM`-JnD*|n3Iogmly@+UUyYhCMv0ykPxzq1!beKIb7%P}j;z_9VP
z@q!R8?jx728Pk_Kc?pLr^Kt+6GD+_&5W^*cUq(I_OHoOQ-+n@V?rXKNp*?TR&_M`q
z>*wWpL!9)jII=VHcg}pFFmyAfpG=n@9!zZD(q3)$P29rPXsD8e_UHVzBQ@3@B7X*x
zNGK3mVXSM}dLR_|SWt}N1)XV$>qs&iuM2&|zEFsdK9&L#!>U8WeET(f=ym^Kf3=y&
zx@rNcM^_U+Jj*{KG~|-TGc}zP6*wSz{GIE-G86+O12{4)q8u9ZFXi3-65bZVL-P=!
z+({zXczAexW)ti*qHb8v2<W6Wt_*y47fXZ?cG`d;e~FmXDMVxccEo=DbO9E5qD%<w
zdkvb;G{*@ZezyxW63qyYTMHQ4XM+kCKX7I5%?k<h{VDJ2lCf}JMWr@0=U`{OttmDk
zQ^XH#%*pGWE@}@QeYMu_PL0i;<mo+95g#zH9hkd+l1$h35N^4#jK*D2i=L?ht?pgz
z1{yaaun$YX>>CA^Rn9`gsw??RN^j^qG6FScxzSe3j0bu;$<X`pV{+)Dc;QR@AvskH
zUcltW{DNv3kz_8hdC$$=3Z~{Koyj;GF+42{)^X#-s2!w0ya=(R?9Vee?VoH6tHu&8
zUhH@m+?wb^qr=*uP4U4=+G(h&EG|`0P0`Qb-l-JT91DQTkBp9v4vQIthF}uny2EUf
zaQeS;DK{|_-B%tJpWcbXB+r)yXOvbq8~L?0RM^M%iOLP`@=^`|lAU5O%f<o1KX_J1
z^Df$~gyb#^b8g``?NJpReZed-dS;@l>^*1w#LWi*VteP9RQG~mz>D6lNit*!j_nH$
zoa;4s07L-oJDub12lufC8xzF?kPyQ4=i;fcpd#l3?N;0CeINo^<OsT-*nD(Zr6E~>
z<(dA0a^)fRa3e_xbq@w@7ZKy~q-A@<pHTX&4%4^x?M9{V4+hgtR+!3*3;Nfjn9KJG
z<AG1)0#NpSo4vUFJp}H0h6qceLvKYn9_{dX4ih6UM*OeG4g?{eJv<>H6o8Rrp%jiY
zC&X0_a9|ri+^`E<7|Pc#um@E4aknkq%>b*!0Qz&HtAgU^x!k26w(jdcf2+L-eE+mG
zmPE&q4{IEtY7%Fg^$t`Gas+g&3wiP4DBvsK{`yKTJg%#6j-i27TROcfd3guuREczr
zZSZC7%7+@%olUf1OD>5dr#xjQmVY+pI?&^dxw*KDRf%}LG&g42>5>wIOnV8?+3r8_
zUhT<*cf|8>Wgt+e#t)>~AfPFi-30Ixzr4c->Oz-1+)LIcx-%r9?%7iP;}p=1Ba~o=
z>6zz#Fj<^;p^??d8Cp4}=6u3+PDv(CKOoy}SDm`^UFZAd{;8u=x1|S%Ie_hS*#Fq^
zvDbugCE{pP9Snsz(FW!s3072Dm-T)P@7_EQKLT-;^*&1HPW;mf)3)&EmD2}%5$k2(
zM*sXLm8ziI5v$tRl6m2TIiama1K)TfpWw&u*h6p4G$n8DypDQ9cu7~C1NAFNe9Pcf
z1%Bl8%?rXO(aRQy;bnwc>k1g?05J~^kdG5ay=x<w8q}*#b{Uc8Bea46EVGSj@9;PR
z`8z9^ic+(`-uG>U%lBH&yO~REqslJKYJ_!DiL?$&wD7Zf2noZZqra`vVYP?BmhNV=
zT0!8UUSkNKHOSiHao-RwN?tWaoCMlbe9Zc7UF%_=iW52=zuz~uQQ?!c6EE~+>x|-q
zv7vPuH7A<#J14+dWvraheeBz{GNJCi*KdC-mgXxWecgo~MN=1WlIS6T*wRyK1@|b)
zs~ow_O3}MpWv~Wg&%;8V=p!ky5O>2J#*~0KNZ^g}q#Wd1-{P{&op?}FD=ieUF4BCs
zcJSbx)l;lklZ*9DT?*{@folivG(Fxz0@fG};xmA>qxj;*L%S&L)~0XI_RIDH+Gbcd
z*V~KSnyr&HKT%pDp8+_D^dn#`Wv@MGP$%z>J^(TY*}x*d#u17N9nTrI8#pUAWLMid
zMWTBb3ykt3`+Z_HqK1q_`pDOG1>0X`pt;*;$h{=K8JSpPg}Y%q+tiO_#1^H9onIEJ
z;Vr3BRvkXjjwTmTW_c6RSgxLjz7oUc&tocFO+2KY>r^;h+|{=yfT>t$TQR9zsj6{y
z6|ed0gUt(Mc2JaM2IBerxZqL6f2YLY&N+k7kc_BOZj=-r1gg)esKwod0L6ZI`BQzW
zfRMuK>L?bR=OTst@DCg~x2||DH*STiN0JA551r~X>YOD%cQ(&!jO=BBIhLlTCZB07
z)~Ddu!|7h<;MjV6H-90>2u+v7*nmB?3*)?(4mT{x)f=*~%%Y-Kd!1RXdqfC<!IG@W
z%O)n3eC*qE(JhMVE+4bQWp?S67aqU8fbVxhzdVaJzJU?>Mm%(S`a00or9}@<$#a;<
zg18%#9Ef{98ywYhVu=hda}Z1#2lInlSVmO>eZ0D8*4*WM7N{b~(!7pfVYXUGhIWXM
z4P#}TypZZmG9V$d-yz0gLmM2~z6Fjz^B!Se?IdPXES>g!4`{y5vlvUOS#uelD#qG2
zg`CayP&lLcYznz7Zur<Q`n;jhAfuAS0QqH}w-yh;6z67$dDoJ@1PUagf5gsEA55@R
z2yI)k5EBKMU<lguW66Jo+&6>ZSyvzHK15P}2OPiU`L(IJp;PcR_utGtLqB~1!gt;6
zr;rQY^-B!wo`P_pd~2VoHfHG*luO=W&0k^wV=t@Gh23~->E-Z?jHaI;!V`sd?b>i@
zmMDICL4W5&h0sRv3%)_>F6kbuzs%T7H5}83kvOs(w_y84GQ*ykMQ>FDn6K{_v8LuF
z2Zt3vg9L&GDJi?RK7~BtqwRekW?@w>yQFqrNJPaTZC5w0RD&^n1+;-GRdnZpHqD0L
zwZiUCfIB10AaIkwjIdS8DCAZ6)zxcPpI^^am2Y+Xu;5Ad6E}t_qbL6P$!fZzAH!wC
zUNsBz9|#UQ*&g@^U9zz$a$9A1w7JbKq|hZd;5hD9e~4qiu8?Aw`#~y0|14;5;w0F?
z5S`$r?gxM$_&t?g%>3CTrYFV=i(9=}LsA0vEofGypBZVYN;s|01A+G9!w6Eps_iq5
zoZ~<I)afe>V_i}*blarD#S2m%wX9`vmYXvYB#skg#h2w^rkV?S+*Gd@vu!1OXK!qw
z99pj%8Ke{6QcSv6fOH1E&Bkg#L1#@1e1=QEkF5}Sd!{7VuK=N+%!S{@18Vzt;JF`~
z5{X@RPC8Bn4_<gb<_f2hV0<{VW(~g5;7&6g3Vfzj;Bmv+k5`u|;jf1aNofu|Y+iWX
zm?eJgsJdhZriR=Uqu4GH*sf!;8n+#VTN=lW8OJ6tQMl*F9}i(%j!Gb7P>*)-crjWX
zx8kr)Mt1fwS6A11Z)_lgiHQv^cvF&-C#9vy6tAwXN?TZ@6@-nAnUPaaJlp-aa8&O`
zcx-IY%8I=fCp$Y%a&ofP8@`;yIo;E31O0n_37BpuncAQ=8CP);o-de<*>DgR;v-=h
zO|g|HZ%>x|n7l#qyp;4fL86qM%vsu2;sEg#l3G>XOQ20P$h-Ap9J;=wfym#k6EeCA
z+t!ydao>@Jx)kQ0=R@t^xe~~QLGv1V0G?jE=0pFncR=FwiHoD@{stXHK7hdrp11n=
z)EgYzS6_Q`u2KQ<E5G#4z0(WBl_!BEE^uB!LE#+`9CD=SmC^x!ZES4JpW>&St*@_#
z95={D3+(%*lX}es`0wC&vk}_=>*%z<#e|b3d9h)c5h=^buUCV%IKi5{CcuB~-qGW~
z?IqaI5oH&$d$F&pPQv0#FrFB?8Nfenh^ZOHkms{quA@5XTm-3T((2vySUiJlivDDK
z=-Fce`M)e{7*MwaHPMWGeC>TjWm3_=>65CeIme9>c|7HFXM#FS8qIF|{S?MF09c`}
zIAJaX^9aa;I~6P&#0=fIDj*0*I^ukoj}IQHm>254g{k;}?A3Tg_g-+{ZXPp>6>~>y
zg;!*|NcWz*_>w#N*PK6q$|5`PY74g298%r&3Xm~81l)nQBALLw9tZgK%G%iYcmXt?
z=*l(Rj&tfJDSR9qodOuRkjGAmNS1OaMwC5x)6(CrW!|0tLh<a`GfIh=wqtVfO`pyA
z`C)gPtx)-jbv+*Wk|n&es$wgyTomM@<Z{gY1L8?Otet?D?x89M0T^%(5^wCIwr6gi
zZpLg)pwCO@(51EBNr)kNT#cWhX?*F1dPVnE_J{{OX+iKCS(Y(Cduy@fTaN%<tOLr+
ztFkA|Z!A=Re+QkMp#okQO+d|-fDoqmRl0p<XgET>-qAV_Cev}hYlFE`zEDSWapCDt
zz>769Le2LZN2>RCd8_vK?sYFh2}U5mwPnxacQtA<;8Zue>>UcbVb9V}6#QgI*Ipm8
z1u5y>W4as=%bXDPXHME=34I%fv59@%nMo^H%(Rypi;}MP5oKQLpoH@OA>u=h7QL4s
z*0h^wBstd(-QmrEdTaeqakB9Kwv*#A9S@JHt!d!nJfP{d%)KJlXb%i%1oWzh?L%7`
z2JbeY<#jJ6x_##mgXs>0qh~GbAREnY0xT$eV?w+VIgr772K}5!_kBVD!w#uEm@;3<
zv7S?-Ivz*BNy&V)EOMst#6Y5K&9hFSVe4q-PyO`-=ni-_^IZ6);zuGe<X^5psn1bM
zOUtZku&#taauUwCgR9MA5|JdU2sw}?;5jm}vp3FB57HLKiMeXhQaFA4cw9zK4v`ad
ztK5ar=$ms1kb-rz3ZA*Z|Hh2pqChKfjol;`WMCjpHwbebf|3HA;5BshHc@VyUhleU
z6eb-d#OL&u7#c$T?vW4W#;O`l-M%m7=)rXR`8Hi`!aSUU85I@hO&R#wOO(z2rrK4K
zd%!NLflGD!mN2-N$XoorQBY8z3!F5>w=b}Ae#ix32PHJd%uKk0<Hy1R(}m7K0x@gP
z9a0u0kO{l}bhlJ14p%QQC`gMKjTxOADyQP$;ILMPs0xRWFxQ@|@RfY1Xq4tFZEhJE
znU}^fj~~C>L-`%hOEB8x2(avAH8mqu9HJN$zbp*ARoe|WR6IlXlhJvmk>BnOCGLQQ
z2Xw2wyc!j+2H~*`@Slk(D6}jE?b|-V4GdXS2ey0Hpmw%yvPWYaYa=QsD4@sfEG#VS
zTAZXV@h3R>MdV7)Did4?qatdWz+4d1X=`EO<48Jlk(CfpT9?7VdrAdB&83cLISo=x
z{27%`^mZvn%O_OAUY6^srdOpsoS8Nw1lw8=94EsH&)RZjoe0_l!nfv-C6g>qWK29G
zY1+p`33?*2B`>=|dzM}p|8RA3Vu-fz?re|B{m;CXfPR-i3nw5ELorwd?B4fYn9j<N
zcnIZGO%dU;wLS__Y^G3}5uTK8kBf_|(Gb%_aud?k*BX96I{|a(wfOyPzd~1r8htR`
zg%dD1Q5?{Y)|<>Xm!PXQ6B}FoEzilx$#C(y24XR-+^0{UzFc?>T^i)B1ej(v?x0Dv
zsH&;4we|c}6(?zx1hM`(4TlaB1hUgcow?r5Aa8q$v9C2=G-S7_m3c-w)F<i;OnW}(
z$wAr-Lw%s@yT#!)<csZ`u*KR!CaukJG8<`FA#TvRH>8~4kGdp)I!lecF%=>k#>qG-
z?4(`(mU;ULCBu!`adzCCpdV~)_qJ{CUvr0i(6!9_)+xvIeiRQv*@N~w-_h15MwM<C
zor2MLI50ozQBRZ>PeC(A$FYTDdV71@cYu_2HH1-2%wRs7Z*D8x*m?eo)_pcn0gIz@
zU?_4zdy9N38G}j`<d-4e8n;U0T7^aXLUy4#VkA{HUSukyNx;GtSd#@n+;9hzqZL{S
zY1B=~nleb}@Q}94<CyCoE7v|9Hv1yh#z(Bv0D89&+mudH#sVsGo8@{L7~9s_W&w+c
z{2Rd8wLYpe$+R&GIcnKmerd{bBg4F$kKuJUWBd<ymcZo<;bnMufH?mZ6~eWBAZhZM
zpJk55Z>%x8QT}+g5_M@%di}Z;0L&Z^>>@KimIh|tu3PBZFYolmqC<cB$_FwEIaS3Q
z?MWExi(m6RhJdfL16|&<!zv@oU~>>@VmzMTdC=}5o{w@6jj$z-7-9@SUTvV5c-@!F
z&(CkVE0c^RAe?FAeWMLI?#qdn`M}P7$uJ6x0$-Y=*KBtHcWo7Y(E^_*Y8gr|5el@j
z;_*~qrhUJj2j%fu>av*B(ld~;gXF49D~;#o=+SW-;^UoALzC(h(nw-%pZ$D}E`oVf
zSvn3eC36aHMDSg5ddX#2c4fC<P9S1qJdiV2y}_4O*j@%(&+JwoH?!m3&Z^p4=?#7Y
zJr)ZQ_s#q*T@UV!*?_1y?Fiu$wG94TPL*XwSD!Hk_3{uLePIWe?}V`=Vd9_RFGL?l
zypwz?1HN9s@Z+Zq*UC%eF~!6uVm9r?XAiV~@bSn4aPqiS!O26+*Ea`2QTTa@nAGPJ
zN^YG@#$CMnMe^83-;g`|aoOL}h9ZvcAr?(U-KCIxN-1olSQQsTN=5Y$ct@}DYkSPp
z8kribnus{;jRQ)WhM1lN4f@$bpWu^W^^vq}{`+4W)O%qQC87)&p{{k-wfqXpW(f@B
zT!z|!S1;_Tnqj6~cP^cDL_8CM*0yre%WtsUir|x9xEHa4P(hgGAZ#@a<bx{>aS9RT
z2B`JNI1gE8pnXhm3A7B=+%3RTBW-*(ykT~tkorrr#x>H%3KxV6Ri#eit{;=FJ%gK-
z));@@exERC9TeX6JD-HiZle&sUc~M(icGQqo=_@|TJ<Jh67~kzKVU$R0^9CFxM9Xv
z#huD0NjruCUsuoInTs;NT99^&O}IAoK3w*-QiHb+BSNh58i3};*q{$?<42*3o%hsk
zJu5DkiiY-K#|lMd17R+aaji~OiJ?<4Fy$`t=*O5X_T47Xv)Bmq#Vw4LUC@j0s<Zw0
zM3@<u;h1xOV<@Tc`bXQ3&${wx3L90UP|+mUTZ1FK4M_^>OzA5;k3lD$JRA>ui4WY8
za|3NK9mX4IELlrvcoH@AsV^7?!V+X%sb2n6h!{Fkc(YzJf>v$C;C;NnlLu+dj4j}+
z;NjZo{(zuIcK_zb<8GJ06j}}byn^h25ekK7V*(DRd40o{dcA+?qxbQ~JUeWA>*1`Q
zKcB`*%C}yEh>RAJap8soJ27X=;!0xFK$>EBi62gd!%ctKS5Xz=uE*p2`l*pEk0{Q>
z8K<j3<x-T#8=#(`n-7)ciEYolzV`43q<&8DR#pS`u{)IbYETYikguS>dJ3>Ir{?$g
zH^llKnMH4=HGWNm_BJ1MKM7bo3J$)~P^3i|uDm@5Q!wpf<R+_>otcNA`^)$1eD|DI
z8y!9<JAIYU1k)!{ujCYUMa$v>l+dMTL7m7^=zx-3;5HdP2|VHt(AC^}wlF`zABD~?
z%R+baD8T@Bx4eIM04T@7hqU#tHjyt;c;^RO28fFtEmaU?;LQwslVNCtw}~Am_^>^@
zJ3x4o5CGmwHY+M$5>i7iwS~*d1vO<-!$YSREyI~(CpuP|V@g_-C#6<_Gu`__p`6(w
z!aNnqJ6`fc#VlQP^3fHkQg(7`RK=B!NNoj+XWog#0}#WM(6LCSjmKgz`CNOO9!B7Z
z?`@iEsfB)vWG*{a2X`u7$e*uCGvkFb46P9)g1NKl1zEvL^;YcmSTP1Ro-lopQ@T{J
zPM@D30&y^lnH%Ac*MI>W((q8ByqPh`!2QnbvO`ZoOIzht5CP{rBDoAhP%2*xMMDAL
zPV%~nobM+w@!)*#@B3Za%rqu}oFhQjhan)pG0cMO*arS3JxS{3-S_Hr#7C+-46rC@
z)TzEw$-c;xh-z!btli(3Rgc4cJ+FNK(_z=oa06XFo7uASAa~=8V&2Sk=#N_;G+`;|
z=Z<Sr@>+Uf2fVHVkr&PSc5;nOF|X8~osqquUvF80UpJ@=X&T0Iy$M1pj>6Z{Rnx`k
zeJ2VV{g#xXxik!$FL2pgJt}6BvyBITQVU9jB~b6;Tc8yh>sEjibmPg8A=D9vr`Dv&
z+cH-RKCY+`J7-nuRb?L`YB0eQ4V1lCCYKgG_;t|4U&o_Ck71n8rQLjO)Pqs%jEYlb
z<K#L92E{(Z2LTIQJUq>fJZkaRJ-iaVZ$MaX=h4e8R)Qy7;e1=@47y2lfLxDhc!<Yg
ziZc?sj)}^|ZPN8SZly5f#(taP64kbt3d~7oNn`7Xt-5~GvBx19o<e0-$eD+-mgkg4
zBfWu5JrdW_$NC{0K3bmh{e_eEat$zWhtW7vN6Ub=Q2siTs0MhU-`fpX86sOoSxxe_
zZxQIhxR)F$EV&?G8vBOe<fr~z_AsWdDoazVk!c<Xze|un%49CL*a$0SVtk0jWB*kV
z=x(i<Fc^4pTZ=11Y2i+$VfmY7^L`S`lEz`AywAS9+N`1UX4M9SW4KbBCf*6%8s~W-
zd-S0;=tH%QoPe@_2xZ`FJ3XbZ?P35kBhNLe{o@nJf>eE4ZOn8%xpMQzn^JMkGF6ba
zS+4#l1p6%~6x%wnkWx0n`OQ?4<Lbar$2^|9F}(7qv8%f;jq5q6-9Ak}^F4eLqPF)i
zZTVZ504IH$W#-wd($P@mDDCt@K4M#AY;0`qL?R9DI&`(A9w-s9;J@n*)MDh@Y(gm4
zRe73JSBr;KyovRP{gt@n#`x2SeqH^<S<<Oqmvd$BssB0BTdb!}y}I^-F=);|^6TqO
ze>jq_x>l`)TkySCs)ZBf5wb)Erz}vQOv8dL0Fc;jSQEdVRai%$2tX8PqM3i?!v*+b
zvX0CcM?Uy)+p5hVFUzLXm_hZ0ifpOp%WqRTbqHA+%d-3U3%cs4pDG+5-I~p7fV!qG
zFIFy~u4dZFN$Kf5+ZtT*D<6Eg0_D!H)ay0v9HAH0?#4Knh09^HU71mzyn_Z+%!C#`
zM&DCS^Smle*-0#gN&233IW)HnbZ+M|VkI+wWhsi>z8p#`dTmE{;YNR{!znW}v#!Kv
zbX05`<&7HlJ<U)6+}(PLV<ka;zm&vI{4Xwmzo8lwO3ch`IDnsTc(wH-N>zx>wq+cU
z@AW|(jLHRQsIdi35v#kCo^v`Tc-F@AkR>~TqNJWlm+M7k)trHIg1se`QN(d?-+wPz
zCF7A*%@NJ%F~|N3@?ArKMB7&R_Zje7-vjr-dQLSoCi2j`BIqDU3>`7N+!c7!L5b_W
zx%I5>p1i^F!nBNxRN#ixGq=D@rw3>Z!;6^~L)OC8dUezg0KkRf<~MIZ?)!Hk<kd1h
zeB_wd;PG)~i0SL@x)-L(I+Zkv!RYl=@iWEo=Wjo^Yv@44Mi@$~7e5TqeV){54$_`^
z=yliU&lsd7KBNu){XqKU2Qe-%j_>hn8V){R8{LMoGc;7#MI7Pz+p<`cH2JPe2WDbs
zkoU<go4I#^BpH+pD}6fJa^b@ht^i*b>xWScGQ(3Y<j!^f+RVkk3~;${&}i1>R>FRM
zvlAsx#igsl7kl#(m&0-^*Y(XC)m^!AkLf9IM;cN>2q-NtAE46N*48FxZGC!z$%W&M
zFE)Jcor>in37&hd-Fb775knI875ZATPQElkwuhh`=HaHHQ@>^X_gp-^i51uWcgzwE
z4GmWb<PcFCs_?Faj3+;&^PzdgHVKFU9~}wXZX@mKo`s^Bfw#QK`Rg|F;lsUs%#Sfh
zB`=$Yli%X=WTCyL5d(Y*KBc$x0Wunb)SsP}5F6fg9fp-ncbT}I&);z4Ziyea^rn!r
zxSk34hE#2CR$D#qfhTIU&zcwgEhDDjjK${`MSHpz@b8Ou#T)N46^c$m2Rv&gh{xUo
zH1jAwpD&2)-sb2-##tM7(nfi{%cUh}EF!i|q0AtKjuE4=EA#TakKl#%(?UYJwDd5(
zgdpfX7zcUX4(PK1&}3#+_RvqJwkOOz$70%BmaWbRO51@xbk2v!`;sH4oZ}-yDJh@2
zpU?f#D6x7d`_VGQeSw-~iD}LOdQuO(g_{M5KzK`mO6>ZdP`Z7u1UjUKtQ{}xTl?}o
z>x?v|A@`|M<FaTEuJa2)#S5?)SYUf%wI7%#d#z*ptT;1zw6&a9Nn==Gvgj4wS7M4i
zaqnXK2~jzwr0J5nh~J=WrAb<qC>Fq9<?C(<!jMMD%^iG=n(dlG^^Y}a;aQo5X{bWe
zdJ@&#+^qR7*gb+*7V~&NK_`5Q!<W+277;^aFUl|65?*+<f~8zN7}I94@rWkEG865`
zHegzMyWvdbVkibSfK~)kj@;foKuijO656b`5B(;0Ue0BO%Wgrp4?z@Wb{B6psXp=Y
z@>;Dg2AoMtP*70kK57R7v;8`C7m+UoM57cB`V%Mcgr(-7neBx<`qGw3AopsOTcrUJ
zNamB;7wSM}M)=hoA5@LRQ~UmX_XU2gFW>~ERGu3TC`O+2$=MGYD0c#di2hvNs;x&g
z0mFYZ;Qy|grRBSrnxGq@RqaW=oT(g&lu!@<r5|5E66UcK5du;<k|VwG%#Pn;`C0mU
z<|7cM1$(vU0=Ur9RzTGRdT0Rqy;_A#AE8PmqetY`nbTnvr;{QKnf#G=fIa=f_|3oy
zap^!Ra-q-PqDCpyZW$n;w##n4ehV>e9rdBKv@{b?ER<pEg{Lec47oZsAWIb@T?Ry0
zJ8-p9p;GNTVJuV_n*q8JUT&EtbLo<lQvu>XPAR{C0^XJluUu+VC8s9K4;ScB&{c5F
zUZ)vKi5O?uXDv#s<mz{TX_zR-+3>uH2I3ZB+(;I72m<+7#3Bk#hFV0`sMeCW@+t>0
zGX~9UwJb182beEzy{@tX)8x{VsMebYdUQ9J+Y0z`>NOm&hpNj3{7_MPiD7Gr#@CYT
zRmTgV6yY-`2)m1nZ5OV6n7{XFjt1Cr-aEdpt*UOirJSG>`vtGspt{KO#-Y&O_htBp
zCF~0K(mm)x&`)6m<cLf2)_ml`_SV3Hs{b16&lSqv6!T++)dK>>5JoTeQE})UELZj(
z=nOZ+jwp$QZ(AxmH$HW;hB$E^Sn(-N^*g{6(QaT9_5ASV$5TFOW*vX7jw!Hc_`+i?
z(JK%XNco%>Hp;e-7*c{+FLBGm(OFwCI_M2v9Wb-+8lCg!dXw!dJ_{yvs?1(H0rA;>
zo*ThI;EN054dfY65(Z*Er^NZRcas0;wJX98v)zbN`M&M&9xgh+oW}iNL`U1DHNRz7
z8s*p@Zzt}jKkE6Gv9=77vxR5B^9_ZGi<RL+<n}a==Pbe_pFRx$bCa1k&jF=z)`C!D
zfAY%NenTcXB}4lr*6$m+IC>M;a{xU#{;=3>&AOQReFW1+D+2IYSy{76X@=GSG>@Hc
z9uBW-Zvr4WZ6T;18jvm<k-oPojFRK=i{WGA;Aon2Fr+PPl(v{^1Kn2uV^LS@SwvMW
z-i8Fv0(r2Yc?(MN%mI7~Pknc+{OPff$fzhg0C1#h-)wHWyuJ?rsbcm39w2k&MR89b
zBC&A6*mOD2L#P7rs*eA7ikB1iPIF*H(WFKY*{|ejD_u;=&@)W_qgIRpH$Mqq{!JUb
z;i_MDTvVEJh>P`<*>6fjD-}CCdkqwIcYW3dcr<*x2~Bp`h`5nk^r^A220+MeLlOJ^
zKwnP4Jg+(D>fW<4HBGkju>YY4e8et(3vX}lp64qL^vGdM#c#V&j+g-hkA5CPi~NF6
zv`>!XFF8i$F$=bh<sj00{r;m$6|U<eoqOBlw@w)Ae#4WY-x`T4n^9><VM&Wtnb&JV
zKKI~k%kTwcRnIQCpGNRCH*&B#tnIS@0o_45uB0B4j<){rEtP#l8uK?P<?Pw@_f17Q
z8kEGD_yqfrG%PkJ&Ek{}7)tangGy&iNdc704~&yW8434ONXXP{5F7fY6p(;I;XQUA
zLvqlMr&0Q6A?7Khkmay21JzRMLsxm5xdRp-&3)N$vrt}eyAFv`oYsNlvbI}GLV+pz
z<+B=dy{hnBC!c~cHY9as&xGXBUi=k17&O4w-K{8zJD~bn!LJCVHoSr4jTR<uc|BA9
z(ulB@svw>BSyPTOpCF;1XPLr0Lw>Q<^mQCMw=Ce9_pyaN9N;mVM(|(=>i$l@)1Vma
z>B*}o;B7|<${}9<Q}P&LxH5G6M}S0gM!Z_65c`Z~!jC?e!L<=!OUU<~I|3A{OCIjL
zh-4^&tUrdp`DHiw2BT^j8V0oYVF(u8B(*&x0>_-v={H$~c1H*$qx3Gmfx1jCTwqzq
z>?G&}(_3}VhSfgBcf{2;Z;+3(7CEwn-#)ohB*Nj(rEe%4T7DXBy2Ssu>DZ)i3H&@2
z&MQ8Mvdiih3Ja}^Ckg|muKms7u~xbKJ4ey>dDXN1196~qE<(}hj4=rzISN}{mLErn
zJ_2b~A<oA2gRMq{W-jwmJcbo(gUqYhz-a(>ZpS>VlmGwII;CFkUHX{pY?}hw=cupi
zT`u>MMG&Or%5xij$In_U`)PfnL0JtUl;ancc=N&w;q2~$zxLs;1fG{Dy2`|^bDR@h
zRW0#hreDeV5RsVIpAHs3Z3jb2kSOJ%%cifkehK3N9B~!?0bc}uZ4rGuiFZxy1AwBZ
zBCJ-Es}OSiw%0iCwN+zE?~}<2^uChcu^Jh+GCXzZK63OV(Q1zVTFviV*Y-z?<As|a
zH62pW^Jx&4!bf{jGf5Hs>cxX!MZ_bxJRodK$U2D^4~n1Fga5wniepF5L>fy-xxpXQ
z03x5D=gd)!#B;637GR^l$GeUQcjtQ^r535dAMP7vBc3BL?;zPWmgUge%~%u9)t&s&
z3caE|2|kv9=fp!)EbfJQ9=5mO(W55<0C&oo<JXTOM`H;Q6^N2nBgDb|s5=$?EDDv(
znN-71w?Fm$Dx@m<%QN|c&O)5G=b(l=2t>u*u3LEOJJ6~$X`9`Tpal{@LhAEKie2Nu
z7Mn!Db7G(BDjcv)uI9&!I-5a68+z^<@J?ztydk1VhK>}fYwzmU(dycQiaAVP!N0kh
zu8T<dUSUWi3b3c{9wfLBvPQ3ei~jj(Y03u5Z5U1Dn9Yc;t}4EK%^8;^lZ<WCqi^8q
z6$}NQ<1j%)tb0YeG3Y@=@F2TgGpZ>X>Ajgs_K#v-V{zj+p^BGW+?jn@1*Ftnd{l_I
zIJqzBDMVz=F4E)dC-KF8aaBt1IQx^Gms9IKF3vGOW6AKh$5gh**?Hv@>~fz!q~9SX
zCY|YuYi=b^%Fa$3GsHjU?J1exy%$&*HZuLyVRVW<VCzDHy2OB{VvxN?sQtsQvymw@
z<K~zTm@y<IuF}n_({@%r&~hAU+~)QaoLCIdmzMCv!jg+y4fuGUGrZK9qju~hQ|bI|
z!jQy}({Oc{RN6s*Q#NR8Nks!e9m)%)bdMsgM;|u?m$X2^*D2QpF2JX$D~EqOKU|2?
z-|~yne+}fM<Xc@hZ<gAv=V8n@t(YCY+6TqgcP$3{$An0_M=8B>U-C}RGwt<>$WqP0
zUzGl$a;NlKQg>I&7SWwcTkC$`Z*beV9z8cOq^F|Rvu}=fmdLCwP2k+;n)d5IXdFLe
zD%{z1^TJb36R2Fb?3%r{YrJLi^d6_`?d{`^A`-q`v&OGye*VmGGVSid4|ZggoEEWo
zV0f`<j4CpCROW5QTSkEwxhic=X&vt<?wC%6sSR`q@UbZRWfl<JV=9oPXAvk*X?E&=
zxL5z>Ew8rY<M-_$Q(e3tMSQg`4<uKl;mp(jq*(b%K-qw_+_!q|VRV}d*7$w7fKMdc
zgMnnl317aE^sO_*D&>=Lci`_`rPn1f{xlAHa5G!0sr9yXOgl3+F77#2=dixGNL5s|
z%8Vt~EimBA7?cxPrPJ2Q*s{EqXudi{^wLXCDl#eH#Ch)81>=N<y}HY-nFH(9Um241
z?!<mkV!cD4cUn#I+heCyWh^<iXYsMc;j$;@f6mL!ZLeC&u)IhaH|Ol)VW!-M^Xa=&
zPdSVL=z?=*zf4ojakQXm;ofM*AP0|IDi})0lG&gup5%3~*h=c7(UqY?_;RT`o(<dz
zckH=zZz($a)@3lTWU+xTGD+MQRK5L^?nMKwEK7Rw?t6R^ayrjp2H2&yJZ?7gP2rQu
z!yMUe^HOn5YI5V84JjAf7|(08EbF!yEBMi*pK6G0DlpZwPBRQTzu|uEGQDQgWyYXH
z&qF>;q-`gyN@-@DUl6CWVoSQ#@n82RHGF&ci^PxD{i%DmaP<^}R`cx@Z<0?XNH(*!
zC4HVuz2#(9hr<5N^V7)5Hh<Opi)!U{7JeA_*&&x(nsREV?>S~PlK7!t+7|r-8c+U-
z09Ik^CiPmf6RD<)pXIH8(l^EwK2cJ*$hOX%;Comp<G4g%Rwo^mc*Rl$ui4p+ybm++
z)tPP@#RE(Qb!(wSU5B~DcFz+1oGI^$iqaqUYSh0i#)o%byd#RS<y+g<*rq=(F-=53
zuG;NOlIkTYjWYvOCc6*czAPi)p$lZbIVFjC^5?w{$G&T^VY^?p8)VFr^s2YXtOY~L
z183|^lMB|`=QDFC=~=zyTO-!`mWQGmFI4sG=8amu|Gd;OI#nEHc;}%`b;h0;9~b2s
z?+J@Hi-%jj*5oE8Ty{R;NK--{aqepA&9buj7{(x*n4XMN^&kh6R{x_MgjP~xXKa|Q
zoit=H&Q*Mr-F;%7Zu8xXpm~CZL&p;a=gM?6-nKA@68Pr0tJYj*R2p99-ke=iZ5~K4
z<kIvUZ*U%+h+e#MwbXo~uPm&47c%iYu0W>GEAecM4ZOe@WIAXs*y$g(SOa_dN+L3~
zt4yLC-(%04P$lPwvbc6d<O?T4h4wF73u-jq8)8mEFgn@&_V$9r-X=3N!1M9-#Tq!_
za*-S2C2mYmVuO8wn5}?*+FfcZeFG=I3dMc8KuxH1{#6c!ijnN*>Nck;Z$=bY6<Bj^
z@Z9w<%jq|-=33ipE7xH*U({SHFmNBTY;=R_BGL2IZ&o-k)cm-vR(}smofX%l>Nft-
zb>8<wS3y&y&>~B8_299|bgQn`C^}|B28YEnJF6avfz=Wk=j=Xr4$u%X-8$iW_z4ZJ
z-UR=t^Y<U9tS+1r*VwaZ;SrQ;S-+F0UT!o|O=Os`qeCm#XjTqtcp%5U_!}S&OF(i>
z2I|7;TM_h(jKlYjlET*wYr}Vz7EoZvAuKyM!QuAtLZPXK9`bwuMEn*HtR>!4a7@hh
zgBZ(oCvvf*Z@A63yzHIVC>K%ux{Tk`R;o}O!5qaiy?tTq#G1f5BcxSgIyEt+Q%Go@
z+|t=k$mMEaJnr<fF=pDEQDf>@Cc}NA<`$(<dg9>rk|XtP6MK)c4|_6hOdo+Qki9aG
zPyB39iZ;d~^QrzbaizUA&z?Q4M)4gYpIS4L=yvL56&`O-lb2UrCaNMTqC~?zt&ifJ
zU4O+qGNH>|8Y>;Dbk^B7F5P<~@e4fm731Rb!CA7X4hs8+VeX%8?lfv@>jwuLdNS;H
z@jNEK##pxC7m)c?mi98MY-mMP>SCI%Ta2X*CtI=C4~omxXL;sy3NF8oHWOM={Lc2K
z0wZT}i+Xa>Qg>O=q46G5dUrfMV`jH!w$mx+lgSmz-BGq3pJ|!2>9J})hszF^r;oQ)
zbyA71oHRReS9mJIlGV?ew)^Y87lUtIx-Lz}%#i2U*Nwc(iM=yB#F}}YpQ1-!x9%fl
zqLt6K1Q@e9Ku7h~K&7b9PnUj3*z^3}1c1At1;7SEfy_uS9In-mxSK8iKU~jk<KX+A
zRmX%^lmdN^4n15f-VfSdv7a;EI+0>stZnSlVmb1nj)Lmz6y1QYG^bUc#KkDewR_?-
z?bWr$`=g)azgDTQb&W+ktG^zKp9(f_>eFZEXuc?8kAK(0{oDk5R><nY#l0rlsSrUv
zfg?8r+a%3`0(qZv#yn{G`k^&3A;+aBXOW3NC+vHRZAQqA`~8kA&aPdHdnI2YMsG#n
z53<S-Tm7^j(tIRu9r{gBRR=>>OZHp!|I^-k2U7j_{o|!YL==*hl})ljMr4z02Z!Rw
z-i|#h6gf88*?S$?tCB6UH)T6k#xaiddmVl5&vjq-b$#yZy6(Sze|-Bxbn<?`#^bpj
z&*$s;vivNUICiq_MxJ}cpZUYQ^#;SkQVK04g?!gj$@E@F4e6)$2CUqzpGo@%J?_%A
z3pMwsOj1WVcPg#!d*=*>ne1`*7@y9t=HYutH8*C#A5XmvI#lYjNq(!lwxQKHRd085
zmpd-cn|in^R@kOTcXOb5Z)hFHL+!GgTCQ{G%sopB-PzgQc0L|)*sM=|wS`pUNEnf;
zHzb*8FQRoL*RcYjm{CF`_i!E+^1Uu7gQC0;3o;uX*KPo5ih21*RqOBgZ&6<ln8brp
zeh^3j_onu5?#;-lQGs`T*`|Y~$Ejvxp^Bq)n}fx;%E-Mv0X<st3WLKya(aPz`r5-1
zW|;X&ir^iziL8RAG}ram;IHiD{bABLzQ(8CL5WSLITpxmKtT(vUcLWO*~WuXcj94L
z2lf4-yPUN)CrMrx971hAY7IV!8KBHcb#85VvC>JHe2sSMCGRB)vct$EN(cM8i(YCR
z{v7irOcsG1r9KOqLIR^EH*SVMBlya3iKWxP;vH~8ooPv9yzH(GbyqzOp}X?{XEJ3x
z-8&}2#fb7Gez(zwX?-z}>(%Ka*O0rK2yJ}2&gAKq=hG?Nxo&Mmse~8U5TtyZpq!)6
zW?V3oVx#0I^%H?(IIosbn^8(E_mDTZ)2qZ`ULjw-7#w=%vC9Pq!3Htw1;D-Ff&TmY
zsdVY!VpR0a+<<bfIwEtydJ`XJE!};EHhQ{8ZsvG%FE^Ilsa1<t&Xw0nBg#8Q>Tv^4
z+^?8!Jz?b((vn}t$@O%KH*ced&#e<3&>J?qdt%jl1$jdT@k03H3}^Yu*Qd94sM5IX
zTs~SIE~rvLL}dAiu7=9KN|qY5N%*l8Z$Yl`eBpfJ80N+<x0~iSSp_*KYbVNzgE~9e
zg{rMV0T$M-q9yLM%kldBgr(PN8CdwpAXl_!KOu*s-<p@+M6Mpd33fT-VFkKnYil`^
z%je-o@l+-;4J$6-6OOqw=YAtDz-F=_DWw%OlAM<wl+19^>2*{bgwD3b?`aOkudPc4
z>%B<ow^|z~m%_Qiiso`BP%h5JW#%BYC4|h?k5{@zIK&z%ej3L5d=1Mci^{E2VN70!
zI0dJ_$Ocu6f>(NhQMCGLN_`HvJ-2mkV7u;?8Z}?F9PB=}HR;c4!FCVoE~+TX0rf&W
z(*N$xOb6TFJ>`o*6x&)2%%Si|WmT%Hs_0?iv;GbU8#@bFFMKRN?8}tV$`dlFMz&v7
zMqO^QuI*3Wz&D`A1Q%{Y#jdi(K6LE1Q<5S(kZ>+)>s1dg@19C9HnVd@*N&X5JwxkN
z6J>4Ru%xb$)3=IkOuSmHCxtOT{e|;*cm)<CkFeI`trvm0yp0-i@p}0+W;jLNW&G&T
z1LVR5w5fwU<`R)-==YJbjku#~DoQ3<4?yZR>$j<el@j^XJE!vCT6s3(o7ruaY`4TX
z&KVY@c+CpcdUk=Do10?@TDCFf%IZs;wkg*AD|bGRZoLf|EVWY`6W#9@+9PaFp*~rG
zTAdadiN5?YW7o?oG7zp46O=7UCPY^9Bv!VYR?Ce7xSpEV-^0xcua;!A9gBR!D9I#s
z6D1@Cfts)h=`z${1!$h8nBDs30^8T;rCiSdh3Vjk-pi>ws-s;vnMkhb%VtN_rV)9x
z&IFlS)Q5sfUj{+JVN541RR*-Xf9I$8G;IE`iOSnIE&zC^{jh)2Pus1(RoyBmg}~>H
zsyIEqjrXmnr1%D<MJ+w7^e2x^KDL}FRWh4-D5vC${wTWNv4xriFIPKl&C4)Jz0jep
zZ!Z3Aj!*BMU`M@eEtNScgKV)%lfBo~2w3IIRkHAi0nFWxHM31@oTqMk7O>UAk@J0X
zd7C-Yvv|f{qxJNkljvTX!x!}#%WHbAe2}j!n&)C#M6|HWo*i5#t~QV)`051ddiu`X
zmmdAipMz4o;RbW1XE7#!+PGxwxv^q+dPjvPg!umJ7JiA!)P!!zKq_Kp*>DQyfVinq
zqjieps}QTW;#7psGu*jGeESftijl4<P|sK=-7|b|cX+ec#6p`S7=B)=&8Ysa<<n0*
z=Y@m|K%e@hp+d@thzP2cuiaYU0RF{8L-@Zh2y}ZdjzqUtH^q}^^!$KVzq;}@V65RL
zQiG71ZJSE(l%m4(<O9ao(d7rW7$EAsXPn`n4K5x5<AX5z^D41)E7m&Uj`?l=d%%PK
z@MFi|H8a2LV<U@bueM_r+0*2F6km-eBk+fZbWXo!9gOg8BnNE{n7!<NHLO{!_z3YZ
z@`rp-by)LG+5uT*LvTF##B+Nq2mO&CdCqsW5U{rSSq8pG>a`lQ--;4m)<7|%bLH!k
z`5?rP%NzgIxA^@RlK92A^~p}-gN9Q22ayc3!aJw^MyI;C5ZJHtbm2Mu%a6ba_!ymX
z^;fS!GLHN3SX<J!3kvg<@#h$#3Fs7cb#EUX9laBWVX_(pkH#ia#?`ekrP&Rl@X^XO
z%NOk(iF??Fhu~VK+7lU&mjTs#U!n&F@=gzJwOa+WUbGo8Xk=RaaVY2kmPn=$T^zd)
zv+oWqWPJu2nBY%eJ7cb}1Y2_hdHNl%&>E4Zr$8W$qfd<{8>V`Ob~D2=(NJ23()nv-
zxk?-9_gt9Yn70{q+G{!Z)vhMri(2nlm$cJRmW|Epa3*Pit?MD`eUu&p2O-a9{?jAG
zIgyVRS(pN?!&<spp**%M@)m8NUvHIV&Qjh==se~|2500U&8O-glrB?h0!@O1=|kHP
zH(0IL4{q}JVRXvXd{nY3G+p{X^$<Y=tt50jrOMX&T#ka!j@|1M%lk_Xj|Yn$+|GB{
z)M%rZb>gqF%7`P5&Y%6F5bPN4;vjN=k1MdiFZC<`x<80>v(|C%b0S_pKWAI|U$p=n
z4s(fAn^{B4@v#G39Aqmvb&<_A;l;zQpxUq0Dc9OX8NNEAdI_|fciLp31b)jtaQXoe
zanlZ4U&ia+U2f7glSr$l{wV^um<^P_Su!mwmB%e(ZcjrNDlZ_AhcaV6bQi0v;P&UB
zzQXlLBWahc`pL%0jnf97B#eH8XDs(m<zVx+z8^EGq%O4@njM{`Oa8Z881m-mUGQuZ
zSxF?@js*eBl6J?d5(7EL>=?jfgUoZOkXA=?T!|A$NqG@{E9QJC(VC7_IlD4fD`^%D
zA(+tO0}*lYo!gN%o3{j-q?AI9^FjF?@e3aWA|EtXA8}V2Og3yanoiT!pv2Rj^CnTu
ztgE}3AYM=F5EvXaMtzue;4^hkZxRHSW>%t39v1n7+H*>eE?0Q-pUbW|c~)S#E+;Gt
z>M@MR=MdD@mVwRKQ;I7x4F<-bQGgB-_EtLJu5$dX&<nwDOHUE^UG@R^lt9w(>@?HA
z;rDS+=ykfrxd8e?lFQl!ZvQvWFFRo9<fs1|<?6O|vDNCx%$4sUWG+H&uh#G&YPPP|
z;KHIXcpZE8N3HX3ZknzSwof#wP5RIe1>E^E(R7US@~MHlj2BoTh+w{81}$p^V)_@M
z79r%mGxrzq4eeDvJcN?u-r}xK`MqnoY=IonF2_5!8i*bdn{Hg@8Lob<z<xI~mF4iw
zplL$>omw$2cW3fB<UvRCxD;8FtDujwIKp;c?RZH1-p$G6v9zJ4ryvA%KDtNX5QqTF
zx-Pp-IqlRoJbW9()8(X<J`LC=RG=w(kNNwsuxA8X;9TMA#=tjDK;~S=dJo{x+P`E{
zR(*IFtDYL>U3g>bz{<6xSBn36%E4@q<7V<y@Q94)^@Kj?n!>%C0UW0MV2$EW7vTfz
z`@VyjIs&Pwn=}!-M4S><P3v0Y8bu|xy^N~|=p!hvxAzIIqS@=V_(Edno+Kk*<82Zc
z6=d5FHB%SXJhx6Vx~W=e>6339KHJTBPKX;eT6^=f&ZFFd_9j^%4Fi$x+R87~Udiz|
zvqn8A(2P6k2UUR$rJ>kZo#kOehgiSM(Mz<W4H7ev$HrH?t~s!_)#j%8kp%gE>X0GQ
z%yEM}r?<`sn(=@Hxq7r|PNlyYV_}%REhSff!k_)lD-%>?8$!%m7lkZ`I5<GD2{!zD
zmI)PEbQTmAyFIRZA6EJ;*8$5;{F_m`J#5X&>M(0Y?BanoN>rAnU@Hw?rozJA8r06I
z`rb{#D>2NSX>3Z{{-E-v_G@tT;rWVS=omH@EL?y8YN>{Lb-hVl1Okl8<q$4f&3V^O
zg{#2RF019Kd3Zqd#gebw6481R7szQSW@U8)l3zXsy`x3kZjOt4RXn?q)9~SI5UJ#y
zC~?)>IvtE2iYmE?4s2{*1a5ExYBQG1xOW@RVo_;6Od7$W$B$l(eMK(rb5wUI!REsC
zDEy*~TGwPd(#+{$heGh+Vug-%ybIA2tN3D6D`10x{W6bS`^mon*aI0&LHg=T!r~2>
zI%i!#j_5oZwbn2&*cBhvep);(G3yE73XJO=MB%9tnr<VsPL^W`@R&M+Tj<-^erDia
z!2&KM(Y0%X7q4I=xwrqmL<7<=r|QA6&z{Bu2bj)61F6nlH)%^?=llqNUlISw6e(Ts
zF|Q*OA8<Z^lJ+O7?8<YXN2%><kLL(GywmOp4=lJ$94(^HdrUt^1t~t!8B+ejf;YiI
zsva6;ppvS(u^6K1!cFCWteBNI&0JnX?9ij4DUA?{obArgAcU~_R8JqgEq^0tVPN8u
zbv~>bEaeGXSHTFe<MYO8O*^C9w#<!ALkkN66WwVv4~gE?v;JWC&PMeJ27S(Wa+m1z
z<w%sx;>SUSGoP28x$^$O;W4X4h9=eQ-RAHxljB6n@P=9KK?B6q$fu!y=);0kQNZ^i
z3Ms&1{hJh77?c2+@JK;Ij<JSt)DARa)<m<3u%09jQVIp0st!sTp+1b4aQ@mRx+h@V
z=AiXGdG?E>F)FR|`!!92-3Fgc*y>5<OrM=B!h@Z)>r0<K`d#hKf=fi;nF!{{bCL?L
zl2y_+qq%Q0zgJy;Qw+#a9uL~Lsh`97H;l)RS_f^}139)#SCkO6kx)@&ZL3<ihmlsp
z*icn^2e8r3Elh1<RWfi@A%cR5Vk%RnR$a|^oHCPmv$VI2`VH^ny@^q_x_AIla2Ox|
zzE!c;W1b%!HosI8T8hs1ZYg!4Z6N<l{I+FUgaldOme1@!Sgh{#v3^65F~h(lGQU_G
zu;eSYzG90#fGULp^NP*p{!7XnP^Ha6ttAeqODvonn$&HZG7f$<p6#}U?`aMh;FN|u
zmjU7H{)883K%rGUNQS0nY-=a^K9W~z{dTF=aG&r((nt_sLHN8UtVdn|Pq?IN_3*?K
z&%>M2^r-Gc`qg*fq_<3WN^b&u7}YUk{WM`1sOMY=q8cw(@TVsgQ3O6=pw_pqhz<5B
zW<22<(Q<U+p|{c@e|@O*$pl!ogZ|T*{g$Bg)j1Ykp}sCP$g#WW9L;orSWK<`9D>TD
zqr#S^IlSCa<Ho7)GyR<|!i&<0qGQVVpl*{7<1sXk>ZW3K5D$G_lGO~H@0Y@OyNo{~
zjXDXLJSNckfL>149(Gy8(#z%lHoe?<%OUQdJEeOn_&Sy54do)L;0%Gv_mlE;BH9_=
z3k~xvfp;sa2n*LSw;Uif=Fb7rn!u|YqGGAZqeIIhe5DaB5sK}ZubKH|vRk!ND*Mw`
zmCl2he(-_s>lfO0YNke?X4Vy43~n8Gy($;cm$PS<v6DAauUf6)EjzdUhllYX8})N=
z!|1T`)nLk{i>HPjE;!UC@ICvZ6fYt=iIFc2^0*>6IO~@xYxGDB^V8G{Yojd&E7o-_
zr3-7*1}zHJ@?+SFq=_$%2CrI<R5CJ#`LDY5)U(bq#>u}}=Z$pR>e@@5RlVOcDYo|$
z_h5c1f$<fWcj45g8ft~@+3156XKmcnh6PqwZV^XxbI&_6I4qhC)sXTiwyxLArR!%>
zu*tk0xuS`DvL#2dWK4bMbPW8o&OkgJQFq(HZF2Jdwcs(6RA)UZ&Sb>lmQL?laG*0s
zgb8;F3Zu}Jz8{Yd$wS=i=)Tlbi*jibmL2dCy4&of{lrsOkXp-)v%GxF<CJJ4ShsUa
zsumjh0(unhblY9NKJO%egRM&Kd<+{~8@0=#dnBvB3AJQQiN15IN_<$e)muDB7bxyj
zJpQh>{F$mgUF+CR(n>*AUTLGyM}qEdX_5*wUz~POBYMS@fg%04k-0b9z_m@oH>_Es
z+g((WY$f0$xU}=UB-s@H#p@F1&WGZ><`^<~XKg6XrPqkrTw)&7wQ>l^*b{=x^n1uV
z=_8{StYac3&}j{Ro;x(;=)@XEul@2eN@zrp7z6CU9(r^QvSWx$H_9B}Psp&|xWRNc
zH%V;WVtB9k&6s~$kZ`>9NE8IK9gw*t=QqV~_i^WR`9_4>M5v2@PI0ZF7A-MF;UVh&
z*44L)ol$)g%4${C4L7ZhwYG-5(h^iZgTu5X8YKezrzk|cWAxZqjd26wvK=vTAAE;D
z4JUbDU^2t3=jv$v9Jf0SiiSdSK-Dl=k_<ii`I{qbo^r7o!Kb+T-SB>^0k?HcXiP_h
zCu`e*mUFj{))m2Z<82UBM$L7&gra+`b8TIzd^Vgz8_fl}YLuP1<5R`eMLb$2OLZCP
zzidoD3r?nCWvZrhs(-y>aDP{u9ZCmtqzgXu;?MIQk}g)r{uy~p^D6HL4Kl~=u{5)*
zm2CBZmUdOaB(2&;h`Y~iG9M|j`8-d}W#Jcj1P`*;327C!!ODN5+<d%p;|`2Rk5`h6
zw^CwQO}xwD8Cn);x=e}o{+07w92ICXux<eVXsMoAei|qAy}{WsG>plKU9yct*u&kq
z;v8LY*>LPAo$ooE7Vy9pr=lQ~D+8RUxz{#La5!zOJgs4KeD9rX{YjL8<te^#4p)Qr
zPc4&Cy5ULE<RMA_tEe=0TNmAZ5@Zhbjo=h&c9TG7{>q6ylYwaAA38@9_L$N4U!CH9
zC2$s%%$Z$4Emdp!Byg~4Y-!OOlRSprZQMOJT%u-~k|fg<clpKv;oY_vMo_9)SFDJu
zj_nFT1bo~>Pe#<+(i)1KW0%67K5m(6kR%!3%O6g}1hgNC?}X<eo4}Dtnp<$%ck-h=
z8x*5QqxD63QpHD42OenRLE2^(AE({uV|47&IQ~XQ`h}kZNXWV@0bwnQL4ITJhPdp_
zhR_?x*BfIZZVRtDUgRsb87+{t=>}H#aOY3gOLh=#Y*vp7h_-b6Xtxa><5|!BxGT<k
zNYJZ5%Lv1p{6dU;-JP7A#3|7>vx+ux#=Ya~TM8DH2U;J5DtcYtIYk1e4ZxVYt4OaE
z$!&<MVw7qOBko@o7_~oJPDwISgWU8gffJATRYP>cN;j2%p|h_Om1nBhnBv;%FD*|z
zhz1lM?$zx>6FwPiNDhDf-b3}#lLVU9<%T|YL_aO(k$S}nZZVyB3hP>^P+9Nwf%Pb8
zut7NW_No3x8*6zaeXf_!vd?;dU&Zj|{f*w_Zvh;6n=|$KjEgI+jJ9j7XlLEWt(kIx
ztJ~;fevjA&R>~KXMXfWOuu-u+`O&RrFL+Pl!IgT)*7{YwgXEk$Kg>37e%;526sGO_
zdhWUU#F`)anJ&*xYm^$dk&s5Rl;$Qea+L>v&2%C|fBVqQz%?xLx{lNbh}9DoqVjYW
zHZBOF(bOZIVcoO*TI(V%<cs}alDMfPSsDLb7>^^q1}EAyfivZ_VJanVp!Au!D6^cT
zyUJU|WyblyJq7a<vUa^g+gvto2YBJh#6u(0BIb(b?U<nx(*=oxv2vj7z83-(WNCGI
zbQB=E+raQe;5i)JTlm+JxBA@*a~deXB@c>VTVsME9==svV|%-HsNkvV;-wVL4P?Z>
z6p%OM7YtME8}}ajbD-X^Xb$*qH6Lopx1Yc<w~FTw5V4evHOpy~5NQc@`!s##fgW10
zF;c{2WfpmRs%%yz0Y$med^n799$Y6yhaA3tU2(#vx&eVNxV8JCF$v@-L_`Jx;vJ%L
zi5CBaPi_DK{uhF$l8c8XI64T>Y_j9s2R-Q#yo;(7-ruau#}8XQj0$$B+iFf;b5vWJ
zw&yiMvud?}#$2stX#kVjw`q<nlm!afst_0Gb{tj3<2!vdMBWeGH7A3IEoU~EjK90l
zoLUt)6WAskzRsB}lvPw1Ccf{0S`1m&XPszxN@#;o5Rd6I$;mk!Vc8|V17NLt{jXsC
zlzpMgO=%<IDU;(Wyzg|C`PG==2TH@uu{g{V63u~4E$YijluKOtYi{_EmqX9r3)F4S
z8g>@>bE8H>YrGK|&XkigB!;H~gXIj1`yMW?M{l+GbE0jNi#IZbe!6(XPpJ0I+}Jk^
zM6D;y7bj==Wa1R_RC4SJN2U8^Uj5agtT&6UihBC3dHt9KIpXDiE9EVawzu)Pbir{6
zgSr-hmBL2orP3NyplE>MajA;<e%iXI#vUyV=W$OmSDdlWbGY1q?W6OOF=StDzLWkH
zPA-+F)i6j==YLQrR$mV{$R3)E>(%XuAV+}mQ1%_WK*Pgau^BcU|1c&UBzjzPF*~23
z$0CnCqcbOe?)`R(WgOykI2g0gHE~54Wta<<WERy}vKM?JzkV6AJ0tHd*t?X_Xxqz!
z8G}q=ROSOKy|ZXlR`?&u=pV+l=FJB>cfJ%iS%yONT{rS=j)91DP2`x^0;^kwrUN2U
zrfb=3X)6DDsJbR@WR8HsaYe0IobhLFF)43%*vmvWO!^;I=6Ec`5`HvLc~_#%=v$A1
zwImt4`+)SP+H-XHwpYMq>At?Q;GrH3x7K}9t*ph%wV@EUZQ5m4)O21>dS-m>tCkM7
z>qwp6?c((SpGme_^2RKeKq^7&@~?N{zlzV}tB@~r#m@6S8Rq=TSP?n9F*%jPkM2h*
zCZ~J|wd<_e=U?XAA;A~DT%0G|p+s$-r69BG79o0I9h#KuVK?CQ040_>>y3W(H5-mU
z^?_>aK3%XBxj<L8beqvG&UNJbYDjM8Nn_ow_RmFk>n4wfG-wYf8^fY)4L;DSwyqN<
zY;Cy9`sJlJ52)7X0TrfTr7%eZtBOUwT*CxDMkUKX*UpKSMwfAc3YBN)Jrl#2eBbl=
z`zC<d7hm{u_J?)1P%K$(ORnvD;(A*Go_E#s@~;RkRG35zJoojwjC#&EINywP{Jc$)
z6n57mrh&ED!{@t7bCy%Yri(QJN`;C9`Eep&6%x-4Y<y;8Tq<Ja>G@~(g55(2ZIame
zNi&g_z4>YDF{+ufKk!YyXW9y&YAPyZv>F|&VFcAr#f`HgYImp8)+<!^F0MgO1iczV
z*e-N?>DC|`Dte!yyHl5?(2f?&31hu7n}$HleRwTKuzy>HU|~@(peE~;%h0Hy`m^P=
zhr><^{Ftr09O*UHAJOMl@^M*AD{UrXMRRYSrb${Q^k#X#X7sz&Z6=Q`J@0FNf!{8H
zXRH+rF~UmO<h(9E1nT71i#xy<z;(TO2Ugk|oZ1jByqC(A#`VyJ+g!A@^G$BbxN(^B
zyR>JWg2)+kihz6CFsqAVvHHd})IJ{MWyg5V<^+EkY)~Y2d6E>>Mg(`WE>UPI8_^U;
z<&->Ee<43E#(XJ%{KLoEqv*G3T#ImSwkQn|4I`3%N_6%5D@xb~vJs`|DGNOf+0EML
z23EC^f`1kKZB!uxth0@l1?S23E!L?<8NuGH!&?+?lRgD}i=|w2il5{!jDZHx5<<ro
z@4}k7xoBiQnF4U9<houzfAKmlHx}pxm6(Nm%-sSH^I@L|m?36ZM!eKClz5od|4@Z3
zPh~*xi$|hs;Lb_=#vu1g;W_AZBZr>YY5jWoY*<S9K9ufJQlc`YA@ET~8y#Lm%0e$0
zEwNRppP%YYL$G#YMhTX~;LdmAZA%3-T!r^OcdMyuE}+zt*u{5(d=!iVPeNK$9iAoS
z*eXdsmg&r)Y>1wFGRi5Sl?{K~HuI6xL`(r}>#8Vl=N!w_RuT@!<drq6ZPK>e*kL~|
zaqF0O8$%^yd?PP^d<6b-zzp7^3pUh$_|CTnm^m@^%{u_n*A2k;yE?XLWgWOpIYu@~
zFloZXEO^T0x#lCNf;Xd2Fcs^_-BNjqhzYlnhRRnnO6TDx)irg{XoQqcR+I0usE&w-
ztly^7Lh%&kHaO)eT0eGEQ<s;kIh$pl978pobpr3=S$nu&yA!-y5EXH3$yB5~?7rB2
zrD<f@ngsd4%fHFd8|>4{6W~R?(9K8geCYDXsLgku=B0^VzhT2;-EApqC>e8fEk$0_
zE!>8oL0Qne-`MI?>!Vl60F07*FaFB%P?Hjac;liK7r?^yB_mfEFjE;T$+Pz;Qs2mO
zHlg7JUZ2DO*O~-5c!k8CyvJ|*NV>#~2f^V~@W36w0|3AKC%}I;W1o24E)SRxb!~yV
zMWA>pfR%k=lT}W*ro-52AU3G!wWEyBSiOC{f1z%O0fzU$T!Iofek|FBA;if2=H@2c
zeGY(9UQ!}@AU8jmB-%&}R=!z*vHIDO08^ShNSIO$sL4Wptc5Z!WH`_>`ME<Y75#JA
zliu1R#cYfUV0T_O3v6nP{q6w%UzsFTVlY&RPYYn$0)}@2Oc}vCd~LCR{=%xIe89`g
zk#=b*FGci#v%O7hx=PfRbhMqsuFqEh)HptU(s-Kzwiz58Nc;-|AZE%&11`|M_(jn-
zjHzxYVf1X^;Bh2R@~OofnLspQ>3pVq(QB`gpyb_qLTitf?Y|WN+kzJJOV{>AFwi2e
zBhS$(J}7kF?~2CyC)n7tU;y*8Fqp~q*lO~z`?NX?M0YaZ)-WfetRn9j%<DsAKPKN`
z6c!sCB%})_HV@XMhne%BWT^kj&Hm@8pdx?K@*;OAIn%Zcl&khxKbC3j#A}>n4d=l-
zDTnB>WBsm<t1e6KYnK!6N)_)j(*^qk(Rsv&F~N5JGBVP@F52~yY5<mR<Zpgb04&`q
zB}Tq4h2&|fdP*#uriRK{7uvMWwUvCa#)lo`N9xbsasWkw-ajC(IbyW3=Gkr_KPJMP
zk^#<5ReHL~?!fGKG51BV+o}Wh<vk(@nB;Rf0^~sV9^mjiWB@vgK>$Q{0-fbU4dR-+
zh5NNO%vk^VaFB>3A@QerEx)D-ay9y`+D8w6Z?Pg}Z3;o_!pDzg05lnIVFJO6Dlfn0
zNG;1YH3ho;frnQdHP8~z0Ah`lWTnp{_yq95h^ODJ<Zv)CVoJJTiiCMxQt3n3PxYpZ
znmV1z=m8$Z0D}X+ua)gq6>H_Z#o=D;(~hFUrhOOsX1|J{24|ndyr7W6fURBvHsC*;
zT7~bx*ukVD(tOue-x_OG8?r@ND5SU|+Hyu@ymFG*Q3qeFr>n(Q!r&D!!(G9*Z{F;P
zz(&!>oobMX?3LUwW`@Hf->zVpc(7{bQ!F|v*O3<$W1O3y0NoSQ=N3Y-)(kkcKL`Jr
zunSJPrfV=>3wvqM-*x*#43pn)Cc{0)fD$!QvLnW<=;_Y=7NHo^SNBaVqSgSYg}Itm
zX-&CrR7_39Nt~)YZ$nysbUujz)-@_(;#y77RUIUAqlr0}#x58^vrE2&xO^Llnd+F-
zhbO*a{(|Svq&OY;Al#SFd=Pb@_7ZE5KSjB*?j18W0)iEaDo#f!r%bR8k}`?$=Q|fV
z-4=sjk~edJo3*VI2<r1Y!bBgpk=%EugW)BKii+rgSB<W8u3{+}V}KfCVK%H(<BsTq
z4;Jvh{(5&27xHf80aw>M&L8D@9fa*uCbn;dzpS)<rV3rlXZYH8s1hX>xgMMG&QBTP
z7sm8M@#go-Kx=j^Ya74j@V4K)ha-j!Hk6uQ0gOtx0`R{ccIVXt5W15Hp8dtVyvei(
zs7}joPXH6~Y$%VVJ$ma4RL$G95|AYF`WD*qR@2%u{WXe@VNoj3LlV#w>^jjUX|PKy
z1!DXPZ8JA+5Yhdp1R$J-X<)bNixl|m;ZPjpOh2&z{e;M0KN7kDbSV#3mrh#FgGj{T
zWcv-xi-nPujMGt-F|W4g{9+&i<P`kR)o4X=`9Z%2to2BI#eP?Ko@N=260M+hd~huc
z5GMJ`e?^%5rCJzm7z&~=m3!3vP_+BRR3)~nw6Wy)eUrH_5J)|1Fv!vVvNG)E*Ee(#
z0#w`s`hX`Zo&F3JSkKI=L4u%14s76?kg(==-(t$0IXzYB(`T{Dvwi$m9MCp2i|@x{
z<kbStS5P@t5e8wI9hLulpha5B^Tct7Pswu&iwXoQJ|@qad>!bq7M!x9y$;rK>fC=m
zSm*MqmE2MD#YzQ&Mb5{}^1GQn(K%JH@esj@D*(7Pf?#i*c6U~-09Z|nxyz)$hQ9-I
zez%+huJQ&d$`O2!z<s@m7Z_0<BL*0caT%1e{M9@F?6C@f5TgqPTd6#?mEe=h8q5G2
zHG%-O|KStU$v|Z}E`(zflBd*W^ft5u*Oe3~iup4|@e3gCORyhyFh4ZFV6uy>Tzi6n
z89N3?%Vp9b3f})!3y>>!`_85MtT=uN&WqQ7VMWmv>8lBB|1G9#bWc@))RW_2r5=3a
zf;Ax$cM7l)V^Q7o0tqBWQ5pC!GMpC+aIv9QnN}=l%t~8M{0jtaUuw_Z555t>O7orT
zTEO~0FQ0hM3jxX^6HAwXIcR~W6)Zup%+<HhG4PaMgl8_!KGP@r=Vzje6<Kz$kml0C
z%}>1m&Nk!nre0wkodL;DK|th4MJ?fYkb(nMN-)zY>=@6TZv>A$nwfcy{lQNQ_y>JK
zWWlMJ-`)-|6Qt$Wao+=_|B2<;<_(zheYQJ4BG0w@AAJ2Pg?-^ZR!2<7^BQAktdE_s
zNoJE}GI;9>R!0b0(1C`uVa-9w<u!mqKFSI46~w;$+S$voymH;GGjsR^?1??9BVex%
z1S><0Oj0p<TlB+s(v2gf>FK*JD;LV(@JGvLWMwJ%h~5`|LZ5D=iUbs#;u2Zle}st0
z7y`(`R0`c-bf)`<*>!#HP3!yL&K|!awx78Q-HkbMZ!DvUwVAbf+>RwnQ}w9+FM<IC
zyOL3v@C_K;iuC_MSjp&u*J56WHD@UT=eOVgxlW;hZ&>CaH{A^^ZeKD0VW861mBu0s
zw;sC=!@k59!GCG)`p6WD-x;_4e&A$dDhznz(M$UpWslVJf#p|Rac3QAC#gZx|3?Nz
z*NHiJ6gPK17+{E7Vb?7HOv>`n|8-Is?dO4?XH)+l`9~FG*B}bpm2U=ymi+E!>_xn3
zGaLS3nb(n7U>A4-can?;T!o!i7K(bRR1QJa6%H(1M4!p~B5q=017@hVYp8~H6dy{l
z+}^B}p7q=09ee8<1vQzN3^bK#!0-pH!Eni3mi9rz5XCim)lzv(;~=qlvxoMwcYX(v
zqQYXTJ7Fm+wby!OR*mFUfu3-zo1p0|r^YL*kg#rC9?b^W=8jrEpR%Y0n?5oCEfVUK
zyKmRr6?J<yNg@ZPRz@uHIx-6Ez(J6^sfFHJ-%9PqUDV$58qTPRD7rVkB}o}l7FT!6
zkYH2XLRD+Tk}ICHaA(YY5(d^(WsKG_>v1!a$7(;Upd-KFS#5s(euXIGeYOh7Mo^lU
znx;WUO0aTw*wVhhw;>)k0D;Gx_u#k?oLM<`Ts{!t|4jN9s!ng~Nlf*g>5}b2{{8iC
zG|BYce*UF2QFfE|?OAj&Z>s_GagVp!?&+@vtD`+?HAwulnF9zK<#y*PrH{we4}PvG
zGTKfuW70QWY{K$_j1GPllP`bBdM>~IK+avIM#U$6T4D4L_s{9ZEAGPtCy?tB@Nq6)
zHQ;OFx82(nfhBQX;P%X2ww)Uu!luJIcimKiubx9izrk(4<&;cNw`8*R(PdN*GwW?C
z$~D<Cxn$4gKTfXDtxcr(y!&D9vEh%wU!UK!u`G-IBwL1eh>}o5e$SyMCFArs&^Sh~
zw@%-kY@KH8G!#)+a1^Xb);vGv>zEwD)%qlS&GD2`uc6A&M*thZYv>0Rcy4=Y<7PD!
zeR=nDxGN@#OA2bOLt$wR&+PK4OJSdAuUXPu=JRyu@6-c9MmCXsJwxl%Jr%VY!pbcL
z*FwcMlABs7x0zs@Z{G&~VJU#t#Kw(uKx%1Lx;W0_MzAw@t(q<;i6CGa8Y!YvGzvVA
zT%I*Icy*3{Jm>x6_T;0OkCb-0>Fu?F9Is9{s3G7iK~c%w`@#nF>BgCPtG`0g^z<30
zDk)LG>A3gmCh=aF$#*7Oxf6_v$#lnFyp_sY$u0YDR;s5qIs@z1%cV@FgBk@#Wh@q0
z-pX4rA5e-7_W=IZncH16kZ0;kbU+eg#Rhxp)JF-C+E@%|n&#lTs8%1G`J7@in477v
z)gBG<7;K<uktUZzRA<A!JlN#Rq#o4kl2Ji|SheC6jl7IK5zPQ7y7(@&W=<KPelT%>
z1HZoz)7-Kl=|~DHi*8(<T71p%xh!-37qCxw8)(yAp#6Qdfk`kRJTnQ|S3v{=H)--P
zz?u--)-Wc4%t*^&ne5b9xt1T>Or>5vtl!32t_b$@ibx?YUusW6n(Ta=7L2d+Xd@ve
zWw0=nGO883|D(<{Q+baktw9UvQ#j?ORx(eieVnT8&9a6W_DY|eo}ocfCA}KMOLKP5
zK#M(^QLpVtO;TiZ`6e4td7`ep9oKrShB@7ND>G?*ymfwb_gko;HW^gLurTzty6p3L
z+mleM<E=-zxewKfi?YL{shcYRf2pRYUYKGn_`?Z+@CEhVuZ$Ur(wp9_LL~+jJ|gwW
zfny<k@U8*(HL#x+f@-6wP>dsbx;^R23PaNXu~F+W>WWuJ!XfBCzc^S#TK-DfpE4;x
zImVB??!!3VfKKpUVN7!mV$eEoQfPK#Vq}ikdPTejvdeK0?A@?I5sI38_?ZnXZiICM
zV2~C<Tyf6d0PA{#ABDx<^E&{0mxH@*VzGA*7+bu4f0zA_0DEL32s*_{mu3doQqO5>
zlW%~@@E3BE+15s9uY)B7-rnMb6d4*lVAI<1on8F9%bP0ldcC>h6@8H>BqBSpJc8VP
zbnhCQMPaJ&!TS5@-UrIFP%Jr#+K5$LkFQB0?X_pUe<GLwb*aiJrUtweHc5&SBhkd>
z$lMbj6!wM%?H*}sHP|<-aNR=tVpmhI!gz|e#{1AuoV2`_nV!CH_NI2Dhd-4L+V4b(
zI#U1>BaKdWB!VEk@BR7`o4Ng~bEhU(B7gOljl9{oy3Z;|uo~t=*%^E`qjP+fNXygZ
zCb!epB~+QhjG<dc7gc4(=DWn@DmMk)63b9NC&u+?odH!-)Egn6^{&l5khR^^=CJFz
zfxOrq5p&%Rgq%OU{)Gx%yNz_Z?Hvb$?gS0XSNm+&Z7dV5)IOR}Yzmtnp<G;{zLOkO
zQ{esc#w-23EQ0DIQFHNM9F31C12`^lyQ&I@9#ywG56q*tY1S9U!g*1#j%brzS#U(r
zn_Xl<w9qFvw5Y}{obcYmXehp6L<sHatxtRekUVyUx!TslV3c|^$Sk4VV?!|dYmXsf
z>0A>*LCO&~HpUH$$3Z)%vFDNSO-qCN+W{P{iKmH96!7TQ#S5ZC(COhx?erjBA2f|C
z8=HpO&j>cQ5lZ*wL+|27aZ=zH_Fi2rd(Gj8&fvd@0CM?@>&)ZCvc~_MRCB%tLDc`$
zlV1igXU33^Yvd{f6L6xx_UoyL=tQtbTN=&{(nLN@0WXK_RsE+f`q4E7$M3h3(Ub!K
zKDGQ$jwTI?KfkfiH$)~kYVM`4<E!okP;mw)3u!437W6lEVaZp%Rkh+-)mDE_SNhKK
zgv{q{X$UQL@sEmzO9rYD%+yy5N9HB~>(lX)`7+9Mz$P}5*15J;<}sLiQxQh4@9_Am
zWdd3<<_)gU@`=9c-i#}oAC2XFIL^U3kr!0tWy?f`QiL#p*#Nqu(SQ-?$bQ3~<f950
zEM;*IbExhY)=XRYNet+C9Ti=vA2}ex-@rMG<*+#x1A>31!-m7=K@Ln#$*2ZEgO)XO
zz5A()Srs|Z)}N|-<c52P`S3oT2JV@B^MKc~5BP5w+ZKx$b)*0lweMh$i6DOxmNwZ5
zCztX*Au{*rqQEq=D{h-iF^K9zLlY{pHp7jtU9P}vh{s;Y7auT0J8#x=JF4y>=L%Ce
zv9$KeraBE*9yu950R&NN9L<=qw>9S3Bm!xfte%QZ<vV=GZLAqmHNNw3dAQGG#tS{B
zB@)+gC?&2c+nPH&H_&Sx-fRDD*LEIEkKKI;MS|4XP8YzL0-nCBAU1%w`0y&|sT8fI
zsmXeHrJj+5*q;Qu9_MhnujdrTAS{bw-@m`*1`fa^@i4#e^L+y{KJ}NKK}$7i)j>OF
z)R#T$=_6Gi$YfdS`nVnUI-R1U2&abV-j7eQC!*#+Q#Mcum9Ivpw56_1uVhVxvao}!
z0NJUV5|rFxvGu|M2?go+9FA?WSLNz{k<FemsJ9s7REw{Dq(EpD#&D33$)|E$CJ$_W
z(AYt52_Gpkr`GaGjkt;-$3|gQAP3-PS(=IuzAnZNqI5K6|10x!_|A4*3e<Jf7?^zo
zNg?ENWbMu(v{fnk{Nls;?R_Os6~Gf+W)K?ITtmls=Z8E9`u5JEzfbAd>&v(f_cmfD
zj87Q(d##^Jo<)P7h;hO-12dX1=6&GWbvIP*4<CkhVl$;QO%Tex=ZvJ5irE*4>zlB-
z;sYAoS8|fWQ@zwPSHE*Xezh9whb}P`8j&1H!igZl*HSnj<n?1PFIzgr&oQ>J<`wZl
z_@K$J8|y_PY6ZD7Q<arr(oj@`$EGFH@+UsvW{QUwH0;v2ip6!~uR;dqcUzV9Cg(<g
zT8GPq)Z}MXljM}<d0SZKnS8#{TD1U@?c@<dj)|7I>wAZMb(GoF_VM0;>1Q*or=bgm
zOX*5{0)#LA#--n|_Y0(PmjfMbnGd`+V)OQkSg=(lAezSYTS89u*=!%*2mQqlCO5nn
z-w5&GU5o-T#Hh!0GP=&S(?MBZeoX}j!M3#Q6|Q9M+WSx3OC+XcXH@WX+9G&Ww7POc
zgq5fb;YYSE0wM&Y$yWQrg%}|fy5OstG4-B#QKai;16od=l=ACQg{Sfx9!SO)1W_B=
zL~zkzjkcN2UJHPxSmWXt<L*Pp$ZfniolxhG^TCI(!wF_I_hjpb^?KF4>70lt`*-iE
z;pSySey)J8xE}Pv8k?0=#rInPAdkN><A@Dyl`a{8#@%t^J`Fc_;_s5mhj7ahsFP)d
zVCVY1;}XK#*c7zga%j3H_;G9Wp65kxpd@)bD8JCbD7?|FeHA_O$~e>$JJCmbAvK-d
z#_GZyucxgxDfyn#1q;`#9A*d9V#GHaj?)nLEc1YCfps}<V8R2y)tu6A33Y<@2G<_E
zee*7+D7|<nc97ZSHpiy<B#Pf_?iqr^)1hR=5;fzo_o@BZtXv|H!;faASH3DC?}x<~
zQK7NVqMpkhn5Mqt{qZ9Mm&mKHIqq8c<w~`Y+>+IO^dS|yl^6fmN|$LbZP28HDX_+n
z+awn*J}C3(47?x8@xRg<+_iSDV_ZyhL}2eyF7y(9M|(aNyPKWY0gEr3tU#?ks4-f$
za}D^Z%9wD0$=0^Em6KUPvk$;+<nugyW~#auo|ERRodYFPjxIl~(%L=VgC5H@g)xEP
zA#FWJReEymzEekIhIR35y?~4B@O=no=50}ridL<Anvs4Z;fppBkn~qkyQFNfnn!hQ
zYG9qIJXn6s;I_}=<+sYqb9*%j8er8`@s&V*$fpN=S8=WZvdNYC4>jcpdX{a)HXcb6
zx>V~UGEVnn398OMx^DIj2k*+F0xG%E1l%cf-+t!n&_mJ|goG$idS&GFfB<;+nOqLA
zTh=5?jkbyGR|E$yvDY(CF^j5T%BMg0U4>}0cA6$~`3r5Yv}{LC%lAaCaJ|^m7&vAH
z@d2Fr($GEFDBq!AT1%F~AL$U~ZtLMMv9Q_Je*GT4x|F-`=@dV-qQq49?r?&ViwL3h
z%fqE+YZ#RX;I#hOi%-x36Z<QTgEZ-!1DAY6%^()#<paz9rg7w}Q{=WzLgx@7>tW#x
zqq)m(q)kY4L4>#}jT;F3EI2>#q#~ktzBBY;D;bv+v#Z1?J;oj6SHDl&ewPT~sBQ3p
z2%H=eie84Bc2pS_M*=P()YzdhR9f`=Js9M8uXFwrD%5GRo8d&|vcW!bxH_74jI+}q
zPG@61CZIL?5-$vIM`w~*SJQxTT69P@k6n?eK{*RXv!{ry;KC4-e#yO8lzEj-QBhuS
z{hu;9pvnE?(@b6=y{qm+P<r`40r_!K0BRIG+zz@FvV$gdmTUdbk8rDi#C913*x^E=
zl(6M5b6b0$O36_)2b+)sg_`o_>oUjaCr!+XpdL@)i-xQM_5M$<;o(|D99wBS1v(iY
z)|U3t7ZiHNFkBjUqQz#8UU9ierOm3L(78Bi(Y8>IXePI!VnG5(h1YSF)iH|hxVpyn
zMHi<c<6WNmUMlt=qP8t<W_k0e1^1MYMoV?k%4Y#UA}yQbYiRSc5SQ~=#+ZBY%rS(?
zZA%ZzrXOD}$Au7l$I$>)CK6$MuOu#D{Rq4>KcX1NWeQ%oI#vxOgo(13hi}UL)~(<3
zF40>c<XXn`GQc@H&>J33&ZvIf4omTkDg-FDr8x}<4%X88<Y&NX8TbdILAj*g>ifNq
z8qw5!u7FO3brpiq=6W%7{gK75ZV<yPY%o=;JJ&DbP#|*xh-{J>*ry-#zy*WHppinm
z?&keoA2nj;6d!e0A{|f4)YPGLfG0^?cTfc$m|C4r2#1^Z%N*lojXz?3L%@xoYUQq0
zdS!>+?SvkdPJK>uK6(7ORUVMg$45&EWC{+8pNpz5p8<rD3kJ$ukTd0&?3+&1UWEh?
z_RPzsB?7QZ^AqQ3NEp+(H++7+SFlU-yCSt12ONuu3Khf1U9CM;>rj`y<B!$Ez7qLl
zaxD6Ha%)fML5JUn{GysX%^v^H4Bww(QJ(}*pV^ti4it_R;Cs+PQ6oj61oc}|@@B*1
zv%IOekO2veM#~oloFL<7&^lFg`q+GY5jJ|XEQ*N(jx=Trap2?ZH8qfs=8S<Iukw%9
z?kJB|Ziyb7P4m)5uFu*!YvP}2DS>!P>p~9K`+6N$D{94V!;gm}#@2NH)ttaxFcGaZ
z9!iLelskd7M^vn5Fu15&E|CD`Wo)n`-Yflx(uDoOdS6{rApqu=4Su~Oq#HWkb_X?H
zpUTsv-sYyK4LrGA-oyl$Chf0J{W&Zo0Fx!(Jsc`OH5Ll3)!iAHRm+gtJ#Z%vxGD|m
zSfQcQ*viY~O79A*lN|#PT@RQgZ(4iE#fFQ=cG(M}oc9V{WrrG0@sITtVxK##hHOLO
zFR76i9SV}k_!h-O6}l5;(XtjUVKuL^GeKrsG5B<Z_8&7dIwUzcIbkb4Zm*&=niZ#f
zTOTv_sd;F~oLQJVPB{F1&yS%Ix1e0$dfv3JEiC~yEIaQcu4AYAbPeRzL#8|mX{Q<<
zV~c)ipVBEVhHEdKYPlp)O;u}5gX0xbiV?!sB68k;m2jbr+8ubWNLUcN)#QQRgvwKw
z?erR)M2(Hzt`C-)lvY=*Gb9;9_pb5Ptu&_yyh2-*wA37rFC*u-r&peXL^$2Dy(IL(
zV1$2ICR6#!-ap}QV=g6wOJyM76x+?+D)|oe09~8V%fF2YuD^tSuY&yeN#Z*yA+-%y
z>YX2aSUW;$4f^h^OlQ?N8-XHX!qC{_gl}o~oWP6wFqWM(#9p$r91fC?kLc;sO^%?H
zOaemb&CF75eKT`Kg#RSriu{x)<Ol;t>D0m#nJ$Uyon$9+sQCUdhU?)K<H->tGv)W;
zC=H>5xk#QzpX>pd{X%g}jV`z*%4_`sHTurrdT=BS+l~i$fH*HBY*UuqGnsRuLL#P7
zSYD+{YDnn*{^=B6`y6=rmm(3Sf1r&0d~qj$$AAPkzX&A?Nh2~s@Z}QtUjCLtMLfov
z9AN{KKH;E8Vkfw0W>Rgkj|hj84J4&oCe`QwYv&0On{^Uxd;ith;iz8R;GvDcR+c3(
z@}g8Zyb|JVR$1VB%mL6?z<eK+e*%xxQ|>O<2@>Z?)_(-o#mCQ1jt;g!A8a`jNf)7!
zLz?h6Ildni8O}lx+>6i;6}o+uVr)FJ&TUC9Q>>t&CBA#Cldl8|7GDFeD3CWdjvxKE
zS}qLDO-*6KhNqXIn5<D>uL9||8HT7PTD^mXOC%K3+|f+cY9CEA8zN<+E9zCk08dCb
z3J#7)%3u?7?(WW2Ko>WWzhA~GY(ElLa19F#H1<^+ST<s{7Uy~Tt&b3^t<K>*B9-ep
zR8!mi7IovvgGWW+j)M=o(zCd))quKE5`RzEIaEv&0kYKww$J`nS`~L~q*WJG14#-U
zm4fogyZ+-1`GTEGRX2rpvo1x1O|iqum|=JnqF|YpmPdbreJ3=0mW8d4y{PbPbEX+X
z1RE?}+p&p>UmB3$R~GqWjim1&Q6(9+ZW=Qgay0J0&M$*btxqwjsux!QcVtv?^)mrP
ztL{Xdb!02V@)^+n5$H0CKDrgFvd<V*OLVQa+(|$WJOo5YOi2NT3$oTU@Ssm)KKks3
zCx=DJPe3*=;{_rDn<E8`s+6!a$f|5Q>g)W^65fj%Ki)DYhk-_{a*#0{p(r<HiSaC+
z---~2Gpy#YIO6F&m2S~J{pz_0MV!fpYjn@pvGO&K`=Sk03owIH-5s9ic+2%nth2Y?
zz6s)?WGTOX?|M#dQCP(BO2*MI8}t6z2XE!gEWa!K$Om-<Qv<_fco5I%8KH|upv-==
zCpa5Rl9iB+AWedgHbZ@)($eTzt@m!QqPheNzF_>HZGjc@4Y@?J-4GV#bs9z|70?Fy
z0x=2;kF}(mnBnRG#)yZP0GzQFptqp=;R)(czk_V{KU6DU)CV0;FKvMvewUqtMGWpR
zQFr6I((;|9Mtyxr{i@QUBPx!C<mHtk18LnQgq@b2WeTC_zq4U68}k<AdO~z@bwJfa
zX_{#vsDI=aw_y08j7>mZz$W)T+4^A(<csda+!g9ouUL({yXe|$XG}CmG7|XZY#sM`
z_uInzv9}iOtPFGCn*4qjr{e6S0STx_`QLMQ41S;u60`w>Vu!l;?B>f@KgGko^qijr
zzLUPFrl8xnb!9tfDt+N>hxD*Jl!+~C{l`?MD|qA2$uolO9@%>_uNv%yF~dESzhrX}
zrLEFIEjQB6(&ZBQq;FU!*_QQa`)5=5%p>&SgPMiTTm)2q@SMc|bj;ZJ=C?|EdNJ8;
zfdcZ;EZpYs*+y;V{Zf}Rc%P~5Iy}Y&FILt=6rz+$$RT6GzLokJ-F4?Porx`T`^VHR
z7&P_SpmF0rfur{GAnqSpqeVF<MU_*Kerzzib+#PmUCxsh<%CH`13y*ou1Ye&b)4s{
zSaDkY-xITbAFgs3)aeY+j%cu=oJylAQf>{Lm{I*+;(<w9Imyp1fcsz?HgF6{bn27;
zaOZl(*<e-v#b9ebfos&}z@<M?pzF7STgOwO2=A9s*#vm*jZSBgJE$(iyM)j9^T#lW
zBP?NOsaGmXDUSa5aF2-l;)~!j|Flf|<v#`ApDFC`^f=>{wz1%PxQ-xFb|;_LoSa{t
zFaxQP(82vmt4oh}r7?UYBE-FcImHu+4JSb(2okbw(+FHXP+MoT>-{?dKhF-Pq_tBp
z$OJ0|3I)L4^~%|9>1s$&)9!*R4uZxe)wr$XyUE_ZvFa`TPI2Y~vXcSkscL30BJ4Ee
zq>>GIR0WC7W9AXj(St$EE#HZ83xIZx1La_Gl;zh;=)nu6dvu|aWR~6oGJi|oGvaJ7
z_6$Go%u-HpI_2*jur4J!cMkWb{NqO&=4#;&CD<a6865Dc0U4&(Fny;WEXJN0`nBar
zk?7SuF;X0o*YCZp+Nfpn<CC8QxL$1~%paIWGzdt$rf~l$Qhf#e-i(cn>AL>mM^v0y
zc+deI=`i2n4%KA578Nw-r$!yg#m}>>Mx9V}s>et_B9-QKCU#Hp2ecG5Od1|*(nRG1
zdA=>^imjUACsD5{pK6nNcZ-fe7BSHJN)@PuGHjfX@aTWXNr}G%xLglhKCLu8+IP>+
zp7xotQaqua1%r_lkIm(?f`#i3z<4`qp3eJ-Xgn-`_@e8U>Q4n{=ZcTk;{tX`{UkVq
zAkDy|rvo@QwQ#;q1?Mg3e0{@xuj+!E7nmz2ofA~O*y2F>z#m-8NPA!6^%Mpy1}`dq
zNK2Ev!UY=tM}m7cS=rby(~ZHIIaP8BLBW|dHNs}c{rAqc+U#OlS{kE}5X4ZY_+;&5
z8gz7z0asiqT3K0{U6w7z{iudjJpPCMJEn}Loib$|1AvZG!~xfwWF^9fATeVfkUjuo
zCvKa3I1PZ}-e+re{A~`;sP3~bBiMs+S@=yQ*nOb{10FHsxfPTS&`4FDsC4<anEy6V
zfGDB@I}Fac#PNdh{YU{z?PKsT#J3o{q(Nbm3OndCZS=<%0kA}+%U|wsVD&Y&*i}H6
zAYg_C3|f|xIr4X~d*&cxzu*FnImS*OzVHtG_CBbGnMom6Vfc{`UQ(>~5NqH6`@?;e
z!APuX?h)XFUoFAb{e_UdN_~JWKUVE*hy4cGnF7Op`JxKAFDmUMA?ya6fJyEzHuvN1
z19xpE|7Oqcxbtr#f2Iybn7~EQge5nvutlor&Cgg6p!mdoz~Mf{e=WxEk^9;GPIa1g
zgYo(PzvceD+@C~2`uBj>6uVY_3zr0}#~4v5E>mvV2Z3&YO7=Xf-wC<`JdwWxSV=bM
ziPxN*Zme;(+0J0eOZzSEkud`r`N4D2xU(Y<e?x+V4t@c&Rtxa6<U^O0Xt0k0mrZwd
zDN*pgP}(Ni0?glkuWbH10R8PL=WxWa=^ugr4zK_I>DjMc2I#$Prtv>8`}f2DQ-S{v
z^!@oGmi@TEm-oNE2-FN+_{)KU4}ARq*3aM8{PUrI569oj`DX#tKl>2;+b2-S)qF05
z?SFgGpRUC}ti>O{^$hDc`^u31#XHV+_D_TJ_xt>}b-Q;RXi<sutA8)ufB(uq4e#GX
z^zXmB1>kY(gX*Qf!_wd0arWJx4$c4g_dkCm04k5*d(Q6&@9zij=iC0QOZ%%W{C_a{
h2T1+@2_{G4-`?#UsQvQQI0yd7OFwyBB>C*+{|C7a3XA{%

literal 0
HcmV?d00001

diff --git a/experimental/torch_xla2/docs/fixing_op_info_test.md b/experimental/torch_xla2/docs/fixing_op_info_test.md
new file mode 100644
index 00000000000..03624f9487e
--- /dev/null
+++ b/experimental/torch_xla2/docs/fixing_op_info_test.md
@@ -0,0 +1,211 @@
+# How to fix an op info test.
+
+## What is OpInfo test
+
+PyTorch created a list of python objects (OpInfo) to keep
+track how to test each op. This is useful to us because it
+ensures that the ops we implement produces the same results
+pytorch would produce.
+
+Context:
+* https://dev-discuss.pytorch.org/t/opinfos-in-pytorch-1-10/253
+* https://github.com/pytorch/pytorch/issues/54261
+
+
+## How to fix one
+
+### Remove one op from skiplist
+
+Open [test/test_ops.py](../test/test_ops.py) with your
+favorite text editor. 
+Remove one line from the `skiplist` set.
+
+i.e.
+
+```bash
+(base) hanq-macbookpro:torch_xla2 hanq$ git diff
+diff --git a/experimental/torch_xla2/test/test_ops.py b/experimental/torch_xla2/test/test_ops.py
+index 72a39ae85..2a156cbce 100644
+--- a/experimental/torch_xla2/test/test_ops.py
++++ b/experimental/torch_xla2/test/test_ops.py
+@@ -15,7 +15,6 @@ skiplist = {
+     "_native_batch_norm_legit",
+     "_segment_reduce",
+     "_upsample_bilinear2d_aa",
+-    "addbmm",
+     "addmm",
+     "addmv",
+     "addr",
+```
+
+### Run test to see what failure
+
+Error gotten:
+
+```
+E         RuntimeError: ('No lowering found for\n\nTo execute this test, run the following from the base repo dir:\n     python test/test_ops.py -k test_reference_eager_addbmm_cpu_int64\n\nThis message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0', 'aten::addbmm')
+```
+
+From here we have 2 strategies for fixing this test:
+
+1. Add an implementation to `aten::addbmm` operator using Jax ops. Or,
+2. Add an  implementation `aten::addbmm` operator using torch ops (this commonly known as "decompositions").
+
+Either way works for torch_xla2. For ops that are not "Core Aten" sometimes we implement in torch ops with the goal of
+upstreaming this decomposition to [pytorch decompositon](https://github.com/pytorch/pytorch/blob/main/torch/_decomp/decompositions.py) 
+so other projects can benefit from it.
+
+For illustration purposes, let's implement this op in Jax. 
+
+(NOTE: this doesn't stop us from upstreaming a decomposition later if we want)
+
+### First Impl
+
+To implement this op using jax ops, we first find what 
+is the exact semantics in this page:
+https://pytorch.org/docs/stable/generated/torch.addbmm.html
+
+From it's math formula: we can implement it as follows.
+
+```
++@op(torch.ops.aten.addbmm.default)
++def _aten_addbmm(input, batch1, batch2, *, beta=1, alpha=1):
++
++  mm = jnp.einsum('bxy, byz -> xz', batch1, batch2)
++  return beta * input + alpha * mm
+```
+
+Now running test again:
+
+```
+python test/test_ops.py -k test_reference_eager_addbmm_cpu_int64
+```
+
+(NOTE: the exact test command is printed out when we run 
+`pytest test/test_ops.py` so we can only run the failed test instead of running all tests.)
+
+We now see this error:
+
+```
+FAIL: test_reference_eager_addbmm_cpu_int64 (__main__.TestOpInfoCPU) [torch_xla2_diff:0.001]
+----------------------------------------------------------------------
+Traceback (most recent call last):
+  File "/Users/hanq/git/qihqi/torch_xla/experimental/torch_xla2/test/test_ops.py", line 654, in run_export_and_compare
+    diff_output(
+  File "/Users/hanq/git/qihqi/torch_xla/experimental/torch_xla2/test/test_ops.py", line 617, in diff_output
+    testcase.assertTrue(
+AssertionError: False is not true
+```
+
+This is telling me that our implementation did not produce 
+the same result as the ops in PyTorch.
+
+To debug this, let's figure out what exact input caused this.
+We can achieve this by setting a break point [here](https://github.com/pytorch/xla/blob/master/experimental/torch_xla2/test/test_ops.py#L644), right before the diff. Here we can 
+inspect values of `res` and `res2`, as well as the `sample_input`.
+
+The sample input we get is 
+```
+SampleInput(input=tensor([[-3, -3,  9,  8, -8, -3, -4,  2,  2,  2],
+        [-5,  1, -9,  9,  1, -5,  6,  1, -4, -5],
+        [-2, -1,  5, -2, -3,  0,  5, -4,  9, -6],
+        [-1, -7,  6,  3,  8,  3,  8,  9, -5,  7],
+        [-3, -4, -9,  9,  7, -3, -8,  2,  5, -3]]), args=(tensor([[[-2,  4, -2,  5,  8],
+         [-6, -2,  5,  7,  7],
+         [-8, -3,  2,  5, -3],
+         [-4,  7,  0, -9,  8],
+         [ 3,  9, -9, -2,  0]],
+
+        [[-7,  1, -3,  7, -4],
+         [ 3,  5,  4,  6,  5],
+         [-2,  8,  3,  5,  7],
+         [ 8, -2, -8,  2,  0],
+         [ 6,  1, -8,  8,  0]],
+
+        [[ 2, -1, -5, -8, -9],
+         [ 5,  0, -4, -1, -6],
+         [-6,  2, -5, -2, -5],
+         [-5, -3, -5, -4,  9],
+         [-3,  4, -9, -9,  7]],
+
+        [[ 2,  5, -7, -3,  8],
+         [-5, -7, -8, -4,  4],
+         [-4, -6, -3,  0,  6],
+         [ 8,  0, -3, -8,  2],
+         [-4,  3, -9, -6,  7]],
+
+        [[ 2,  1, -6,  2,  8],
+         [ 2,  6,  4,  1,  8],
+         [-9,  9, -5,  8,  3],
+         [-5,  0, -2,  4,  0],
+         [ 5,  8, -4,  9,  7]]]), tensor([[[-1, -8,  3,  5, -8,  2, -5,  0, -9, -5],
+         [-4, -7,  2,  2,  1, -9,  2,  7, -1, -1],
+         [ 1,  8, -6, -4, -6, -8, -7, -9,  7,  4],
+         [-4,  1, -9,  3,  4,  6,  0, -2, -2, -7],
+         [ 5,  5,  0,  8, -3,  7, -7,  8,  3,  5]],
+
+        [[ 8, -4, -9,  9,  5,  0,  5,  0, -5,  5],
+         [-5, -3, -2,  8,  1, -2,  4, -7,  5,  3],
+         [-4,  4,  1, -4, -8,  2, -5,  2,  9, -7],
+         [ 9,  6, -8, -3,  3,  1,  4,  6, -5, -4],
+         [-2,  1,  5,  5,  2,  6,  7, -3, -7,  3]],
+
+        [[ 9, -8,  5, -3, -1,  2, -9, -5, -1, -3],
+         [-3,  3, -9, -7, -9, -8,  1, -3,  7, -2],
+         [ 8, -1,  8, -8, -7,  4,  8,  8,  5, -7],
+         [-1,  6, -8,  7, -1, -5, -8,  6, -2,  8],
+         [-5, -5,  8,  6,  0,  1,  3, -2, -3, -9]],
+
+        [[ 7, -2,  6, -8, -5,  3,  2, -1, -5,  8],
+         [-6, -4,  3,  9, -9, -8, -7,  3,  9,  0],
+         [ 1,  3,  4,  4, -5, -2, -4, -2,  3, -7],
+         [-6,  9,  5, -1,  7,  7,  8, -3, -8,  0],
+         [-1, -6, -3,  3,  3, -8, -4,  9, -5,  7]],
+
+        [[-5, -3, -9,  6, -1, -7,  9, -8,  1, -8],
+         [-8, -8, -2, -5, -7, -8,  1,  0,  0, -6],
+         [ 7, -5,  2,  2,  0, -9, -5, -7,  1,  8],
+         [-4,  0,  9,  6, -1, -6,  6, -6, -2, -1],
+         [ 7,  3,  0,  1,  1, -9,  5, -8, -1, -7]]])), kwargs={'beta': 0.6, 'alpha': 0.2}, broadcasts_input=False, name='')
+```
+
+And the `res` from torch is 
+
+```
+tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
+```
+
+So few observation is:
+1. Input tensor are of type int64
+2. alpha and beta are both floats.
+
+So one can suspect that it has to do with rounding.
+Reading the doc more carefully, we can find this sentence
+
+    For inputs of type FloatTensor or DoubleTensor, arguments beta and alpha must be real numbers, otherwise they should be integers.
+
+So likely torch first casted the float alpha and beta to integer, which yields 0, then used them in math to get a matrix with all zeros.
+
+### Second Impl
+
+```python
++@op(torch.ops.aten.addbmm.default)
++def _aten_addbmm(input, batch1, batch2, *, beta=1, alpha=1):
++  alpha = jnp.array(alpha).astype(batch1.dtype)
++  beta = jnp.array(beta).astype(batch1.dtype)
++  mm = jnp.einsum('bxy, byz -> xz', batch1, batch2)
++  return jax.lax.cond(beta == 0,
++           lambda: alpha * mm,
++           lambda: beta*input + alpha*mm)
++
+```
+
+Adding type casts makes the tests passes.
+
+### Submit
+Now, let's remove the pdb and prints we added, and submit the fix as a PR: https://github.com/pytorch/xla/pull/6993
+
diff --git a/experimental/torch_xla2/docs/how_it_works.md b/experimental/torch_xla2/docs/how_it_works.md
new file mode 100644
index 00000000000..e4098ca0096
--- /dev/null
+++ b/experimental/torch_xla2/docs/how_it_works.md
@@ -0,0 +1,134 @@
+How it works
+============
+
+
+## Tensor subclass and eager mode
+
+The class `XLATensor2` is a `torch.Tensor` subclass
+that overrides `__torch_dispatch__`.
+
+It roughly looks like this (with some details removed):
+
+The complete class impl is at [tensor.py](../torch_xla2/tensor.py).
+
+```python
+class XLATensor2(torch.Tensor):
+
+  @staticmethod
+  def __new__(cls, elem):
+    return torch.Tensor._make_wrapper_subclass(
+        cls,
+        shape,
+        dtype=dtype,
+        device='meta',
+        requires_grad=False,
+    )
+
+  def __init__(self, elem: jax.Array):
+    super().__init__()
+    self._elem = elem
+
+  __torch_function__ = torch._C._disabled_torch_function_impl
+
+  @classmethod
+  def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+    # here assumes ALL tensors in args / kwargs are 
+    # instances of XLATensor2
+    args, kwargs = unwrap((args, kwargs))
+    jax_func = some_registry[func]
+    res = jax_func(*args, **kwargs)
+    return wrap(res)
+
+def wrap(tree):
+    # wrap jax.Array with XLATensor2
+    return pytree.tree_map_only(
+        jax.Array, XLATensor2, tree)
+
+def unwrap(tree):
+    # get jax.Array out ofXLATensor2
+    return pytree.tree_map_only(
+        XLATensor2, lambda x: x._elem, tree)
+```
+
+In other words, assuming that we have a function
+that takes `jax.Array` as input and returns `jax.Array`
+but otherwise implement the same semantics
+as a `ATen` op; then, using this tensor we would 
+be able to route the call to this jax function.
+
+[_ops.py](../torch_xla2/_ops.py) files defines some of those ops.
+
+Let's take `aten::add` as example:
+
+```python
+@op(torch.ops.aten.add)
+def _aten_add(x, y, *, alpha=1):
+  """if isinstance(x, jnp.ndarray) and isinstance(y, jnp.ndarray):
+
+  assert x.dtype == y.dtype, (x.dtype, y.dtype)
+  """
+  return x + y * alpha
+```
+
+The `@op` decorator just puts this function into `some_registry` dictionary.
+
+`_aten_add` has same signature as `torch.ops.aten.add` but takes `jax.Array` as
+input.
+
+![](dispatch.png)
+
+
+## fx Interpreter and dynamo mode
+
+Now, assuming we have this `some_registry` dict with key core Aten ops,
+and value the equivalent python Jax functions. We can also build a `fx.Interpreter`
+subclass that executes the jax function given a `fx.GraphModule`.
+
+
+```python
+class JaxInterpreter(torch.fx.Interpreter):
+
+  def call_function(self, target, args: Tuple, kwargs: Dict) -> Any:
+    if not isinstance(target,
+                      (torch._ops.OpOverloadPacket, torch._ops.OpOverload)):
+      return super().call_function(target, args, kwargs)
+
+    op = some_registry[target]
+    return op.func(*args, **kwargs)
+```
+
+There is no wrapping and unwrapping needed because `args` and `kwargs` are 
+already `jax.Array`'s.
+
+Using this interpreter we can build a dynamo backend:
+
+```python
+def backend(fxgraph):
+
+   def tojit(*args, *kwargs):
+    return JaxInterpreter(fxgraph).run(*args, **kwargs)
+   jitted = jax.jit(to_jit)
+
+   def f(*torchtensor):
+     jaxarrays = unwrap(torchtensors)
+     res = jitted(jax_array)
+     return wrap(res)
+
+   return f
+```
+
+The inner function `tojit` is a function that takes and returns 
+`jax.Array`'s. So it's suitable to be jitted with `jax.jit`.
+
+`f` is returned callable that takes `XLATensor2`; so can interop with 
+other torch codes.
+
+## nn.Modules and state management
+
+See [README.md](../README.md) for using `torch.func.functional_call` to 
+make `nn.Module`s interact well with `jax.jit`.
+
+See [Examples](../examples/README.md) for training using torch's optimizers or jax's
+optimizers.
+
+[def]: dispatch.png
\ No newline at end of file
diff --git a/experimental/torch_xla2/docs/ops_registry.md b/experimental/torch_xla2/docs/ops_registry.md
new file mode 100644
index 00000000000..c0e68f42fc4
--- /dev/null
+++ b/experimental/torch_xla2/docs/ops_registry.md
@@ -0,0 +1,40 @@
+# Ops Registry
+
+## Background
+
+In the [How it works](how_it_works.md) doc, we mentioned 2 important pieces:
+
+1. A mechanism to route `ATen` ops to implementation written in
+   Jax or in PyTorch, and
+
+2. The ops themselves.
+
+
+Ops Registry is there to help us to organize the ops themselves.
+
+An op implementation can written in terms of Jax, or in other PyTorch ops.
+The latter is also known as "decompositions". For decompositions, 
+one need to be careful of not introducing circular dependencies.
+
+Here we simply store the operator implementations in a dictionary,
+which key the torch / Aten callable that we wish to override, and
+value an instance of `Operator` class.
+
+`Operator` class has this schema:
+
+```python
+@dataclasses.dataclass
+class Operator:
+    torch_op: TorchCallable
+    func: Union[TorchCallable, JaxCallable]
+    is_jax_function: bool
+    is_user_defined: bool
+    needs_env: bool
+```
+
+The `torch_op` is the corresponding torch callable, and `func` the implementation. `is_jax_function` is True if `func` is implemented using Jax, False if `func` is implemented using other torch ops. We can use this information to decide how to call it.
+
+If `needs_env` is true, `func` will recieve an extra kwarg with name `env`.
+This will be the "Environment" in which this op operate on. In particular, 
+the environment will contain the Jax random number generator key, that might be useful for ops like `aten::rand`.
+
diff --git a/experimental/torch_xla2/examples/basic_training.py b/experimental/torch_xla2/examples/basic_training.py
index 5d3f5a734c5..29e55700a32 100644
--- a/experimental/torch_xla2/examples/basic_training.py
+++ b/experimental/torch_xla2/examples/basic_training.py
@@ -10,7 +10,11 @@
 from torch.utils import _pytree as pytree
 import torchvision
 import torchvision.transforms as transforms
-import torch_xla2
+import torch_xla2.tensor
+
+
+xla_env = torch_xla2.tensor.Environment(0)
+mode = xla_env.mode()
 
 # PyTorch TensorBoard support
 from torch.utils.tensorboard import SummaryWriter
@@ -80,6 +84,7 @@ def forward(self, x):
 
 
 model = GarmentClassifier()
+model = xla_env.to_xla(model)
 
 loss_fn = torch.nn.CrossEntropyLoss()
 
@@ -96,13 +101,6 @@ def forward(self, x):
 print('Total loss for this batch: {}'.format(loss.item()))
 
 # Optimizers specified in the torch.optim package
-
-# NEW: Move model to XLA device
-state_dict = model.state_dict()
-state_dict = pytree.tree_map_only(torch.Tensor,
-    torch_xla2.tensor.move_to_device, state_dict)
-model.load_state_dict(state_dict, strict=False, assign=True)
-
 optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
 
 def train_one_epoch(epoch_index, tb_writer):
@@ -115,14 +113,14 @@ def train_one_epoch(epoch_index, tb_writer):
     for i, data in enumerate(training_loader):
         # Every data instance is an input + label pair
         # NEW: Move model to XLA device
-        data = pytree.tree_map_only(torch.Tensor, 
-                                    torch_xla2.tensor.move_to_device, data)
+        data = xla_env.to_xla(data)
         inputs, labels = data
 
         # Zero your gradients for every batch!
         optimizer.zero_grad()
 
         # Make predictions for this batch
+
         outputs = model(inputs)
 
         # Compute the loss and its gradients
@@ -169,14 +167,11 @@ def train_one_epoch(epoch_index, tb_writer):
     # Disable gradient computation and reduce memory consumption.
     with torch.no_grad():
         for i, vdata in enumerate(validation_loader):
-            # NOTE: move to XLA device
-            vinputs, vlabels = pytree.tree_map_only(
-                torch.Tensor, 
-                torch_xla2.tensor.move_to_device,
-                vdata)
-            voutputs = model(vinputs) # call model's forward
-            vloss = loss_fn(voutputs, vlabels)
-            running_vloss += vloss
+          # NOTE: move to XLA device
+          vinputs, vlabels = xla_env.to_xla(vdata)
+          voutputs = model(vinputs)  # call model's forward
+          vloss = loss_fn(voutputs, vlabels)
+          running_vloss += vloss
 
     avg_vloss = running_vloss / (i + 1)
     print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
diff --git a/experimental/torch_xla2/examples/basic_training_jax.py b/experimental/torch_xla2/examples/basic_training_jax.py
index 3941fcdf8fe..ae6efdf4856 100644
--- a/experimental/torch_xla2/examples/basic_training_jax.py
+++ b/experimental/torch_xla2/examples/basic_training_jax.py
@@ -8,7 +8,7 @@
 import torchvision
 import torchvision.transforms as transforms
 import torch_xla2
-import torch_xla2.extra
+import torch_xla2.interop
 import jax 
 import optax
 import numpy as np
@@ -91,7 +91,7 @@ def forward(self, x):
 
 def jax_loss(weights, data, label):
     pred = jax_func(weights, data)
-    loss = torch_xla2.extra.call_torch(loss_fn, pred, label)
+    loss = torch_xla2.interop.call_torch(loss_fn, pred, label)
     return loss
 
 grad_fn = jax.jit(jax.value_and_grad(jax_loss))
@@ -155,12 +155,6 @@ def train_one_epoch(jax_weights, opt_state, epoch_index, tb_writer):
     # Make sure gradient tracking is on, and do a pass over the data
     model.train(True)
 
-    # NEW: Move model to XLA device
-    state_dict = model.state_dict()
-    state_dict = pytree.tree_map_only(torch.Tensor,
-        torch_xla2.tensor.move_to_device, state_dict)
-    model.load_state_dict(state_dict, strict=False, assign=True)
-
     avg_loss, opt_state = train_one_epoch(jax_weights, opt_state, epoch_number, writer)
 
     running_vloss = 0.0
@@ -174,7 +168,7 @@ def train_one_epoch(jax_weights, opt_state, epoch_index, tb_writer):
 
             vinputs, vlabels = pytree.tree_map_only(torch.Tensor, torch_xla2.tensor.t2j, vdata)
             voutputs = jax_func(jax_weights, (vinputs, )) # call model's forward
-            vloss = torch_xla2.extra.call_torch(loss_fn, voutputs, vlabels)
+            vloss = torch_xla2.interop.call_torch(loss_fn, voutputs, vlabels)
             running_vloss += vloss
 
     avg_vloss = running_vloss / (i + 1)
diff --git a/experimental/torch_xla2/examples/eager_mode.py b/experimental/torch_xla2/examples/eager_mode.py
index 358ee6256c6..755f24b0d2b 100644
--- a/experimental/torch_xla2/examples/eager_mode.py
+++ b/experimental/torch_xla2/examples/eager_mode.py
@@ -1,10 +1,9 @@
-
-from torch_xla2.tensor import move_to_device
 import torch_xla2
 from torch import nn
 from torch.nn import functional as F
 import torch
-from torch.utils import _pytree as pytree
+
+xla_env = torch_xla2.default_env()
 
 
 class MyModel(nn.Module):
@@ -22,21 +21,21 @@ def forward(self, x):
         return x
 
 m = MyModel()
+m = xla_env.to_xla(m)
 
 # Execute this model using torch
 inputs = (torch.randn(3, 3, 28, 28), )
+inputs = xla_env.to_xla(inputs)
 
-inputs, state_dict = pytree.tree_map_only(torch.Tensor, move_to_device, (inputs, m.state_dict()))
-m.load_state_dict(state_dict, strict=False, assign=True)
 print(m(*inputs))
 print('---=====')
 
-from torch_xla2.extra import jax_jit
+from torch_xla2.interop import jax_jit
 
 @jax_jit
 def model_func(param, inputs):
   return torch.func.functional_call(m, param, inputs)
 
-print(model_func(state_dict, inputs))
+print(model_func(m.state_dict(), inputs))
 
 
diff --git a/experimental/torch_xla2/examples/lightning_training.py b/experimental/torch_xla2/examples/lightning_training.py
new file mode 100644
index 00000000000..b09f00d9473
--- /dev/null
+++ b/experimental/torch_xla2/examples/lightning_training.py
@@ -0,0 +1,77 @@
+import os, torch, torch.nn as nn, torch.utils.data as data, torchvision as tv
+import lightning as L
+
+encoder = nn.Sequential(nn.Linear(28 * 28, 128), nn.ReLU(), nn.Linear(128, 3))
+decoder = nn.Sequential(nn.Linear(3, 128), nn.ReLU(), nn.Linear(128, 28 * 28))
+
+class LitAutoEncoder(L.LightningModule):
+    def __init__(self, encoder, decoder):
+        super().__init__()
+        self.encoder, self.decoder = encoder, decoder
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        x = x.view(x.size(0), -1)
+        z = self.encoder(x)
+        x_hat = self.decoder(z)
+        loss = nn.functional.mse_loss(x_hat, x)
+        self.log("train_loss", loss)
+        return loss
+
+    def configure_optimizers(self):
+        return torch.optim.Adam(self.parameters(), lr=1e-3)
+
+dataset = tv.datasets.MNIST(".", download=True, transform=tv.transforms.ToTensor())
+
+# Lightning will automatically use all available GPUs!
+trainer = L.Trainer()
+# trainer.fit(LitAutoEncoder(encoder, decoder), data.DataLoader(dataset, batch_size=64))
+
+# ==== above is the lightning example from 
+# https://lightning.ai/pytorch-lightning
+
+import torch_xla2
+from torch_xla2.interop import jax_view, torch_view
+import jax
+import optax
+
+class JaxTrainer:
+
+    def __init__(self):
+        pass
+
+    def torch_opt_to_jax_opt(self, torch_opt):
+        # TODO: Can convert optimizer instead of using a jax one
+        return optax.adam(0.001)
+
+    def fit(self, lightning_mod, data_loader):
+
+        xla_env = torch_xla2.default_env()
+
+        def lightning_mod_loss(
+            weights: jax.Array, data: jax.Array, batch_id):
+            """returns loss"""
+            weights, data = torch_view((weights, data))
+            lightning_mod.load_state_dict(weights, assign=True)
+            with xla_env:
+                loss = lightning_mod.training_step(data, batch_id)
+            return jax_view(loss)
+
+        jax_weights = jax_view(xla_env.to_xla(lightning_mod.state_dict()))
+        jax_optimizer = self.torch_opt_to_jax_opt(
+            lightning_mod.configure_optimizers())
+        opt_state = jax_optimizer.init(jax_weights)
+        grad_fn = jax.jit(jax.value_and_grad(lightning_mod_loss))
+
+        for bid in range(3):
+            for item in data_loader:
+                xla_data = jax_view(xla_env.to_xla(item))
+                loss, grads = grad_fn(jax_weights, xla_data, bid)
+                updates, opt_state = jax_optimizer.update(grads, opt_state)
+                jax_weights = optax.apply_updates(jax_weights, updates)
+                print('current_loss', loss)
+
+
+print('-----------------')
+trainer_jax = JaxTrainer()
+trainer_jax.fit(LitAutoEncoder(encoder, decoder), data.DataLoader(dataset, batch_size=64))
diff --git a/experimental/torch_xla2/pyproject.toml b/experimental/torch_xla2/pyproject.toml
index d0d2a42dec8..14b77ad0216 100644
--- a/experimental/torch_xla2/pyproject.toml
+++ b/experimental/torch_xla2/pyproject.toml
@@ -2,29 +2,29 @@
 requires = ["hatchling"]
 build-backend = "hatchling.build"
 
-
 [project]
 version = "0.0.1"
 name = "torch_xla2"
 dependencies = [
     "absl-py",
-    "flatbuffers",
+    "immutabledict",
     "pytest",
-    "tensorflow",
-
-    # Note: Exclude these because otherwise on pip install .
-    # pip will install libs from pypi which is the GPU version
-    # of these libs.
-    # We most likely need CPU version of torch and TPU version of 
-    # jax. So it's best for users to install them by hand
-    # See more at README.md
-    # "jax>=0.4.24",
-    # "jaxlib>=0.4.24",
-    # "torch",
+    "tensorflow-cpu",
+    # Developers should install `dev-requirements.txt` first
+    "torch>=2.2.1",
 ]
-
 requires-python = ">=3.10"
 license = {file = "LICENSE"}
 
+[project.optional-dependencies]
+cpu = ["jax[cpu]>=0.4.24", "jax[cpu]"]
+# Add libtpu index `-f https://storage.googleapis.com/libtpu-releases/index.html`
+tpu = ["jax[cpu]>=0.4.24", "jax[tpu]"]
+cuda = ["jax[cpu]>=0.4.24", "jax[cuda12]"]
+
 [tool.pytest.ini_options]
 addopts="-n auto"
+
+[tool.ruff]
+line-length = 80
+indent-width = 2
diff --git a/experimental/torch_xla2/test-requirements.txt b/experimental/torch_xla2/test-requirements.txt
new file mode 100644
index 00000000000..1deead455a1
--- /dev/null
+++ b/experimental/torch_xla2/test-requirements.txt
@@ -0,0 +1,5 @@
+-r dev-requirements.txt
+pytest
+pytest-xdist
+sentencepiece
+expecttest
diff --git a/experimental/torch_xla2/test/gemma/test_gemma.py b/experimental/torch_xla2/test/gemma/test_gemma.py
index bd0bb21dbb1..4d91bc6f9b0 100644
--- a/experimental/torch_xla2/test/gemma/test_gemma.py
+++ b/experimental/torch_xla2/test/gemma/test_gemma.py
@@ -74,7 +74,7 @@ def test_gemma(self):
 
         weights, jax_func = torch_xla2.extract_jax(model)
         inputs_jax = pytree.tree_map_only(
-            torch.Tensor, torch_xla2.tensor.move_to_device, inputs)
+            torch.Tensor, torch_xla2.tensor.t2j, inputs)
 
         import jax
         print(jax.jit(jax_func)(weights, inputs_jax))
diff --git a/experimental/torch_xla2/test/llama/test_llama.py b/experimental/torch_xla2/test/llama/test_llama.py
index dae7bf0cc5c..083116ab89e 100644
--- a/experimental/torch_xla2/test/llama/test_llama.py
+++ b/experimental/torch_xla2/test/llama/test_llama.py
@@ -1,8 +1,5 @@
-import unittest
-import jax
 import torch
-from torch._functorch.make_functional import make_functional_with_buffers
-from torch_xla2 import tensor, ops  # pylint: disable=unused-import
+from torch_xla2 import tensor  # pylint: disable=unused-import
 import torch_xla2
 
 from .. import test_base
diff --git a/experimental/torch_xla2/test/moe/__init__.py b/experimental/torch_xla2/test/moe/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/experimental/torch_xla2/test/moe/model.py b/experimental/torch_xla2/test/moe/model.py
new file mode 100644
index 00000000000..9249ac9dce0
--- /dev/null
+++ b/experimental/torch_xla2/test/moe/model.py
@@ -0,0 +1,260 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+from torch.nn import functional as F
+
+
+def find_multiple(n: int, k: int) -> int:
+    if n % k == 0:
+        return n
+    return n + k - (n % k)
+
+@dataclass
+class ModelArgs:
+    block_size: int = 2048
+    vocab_size: int = 32000
+    n_layer: int = 32
+    n_head: int = 32
+    dim: int = 4096
+    intermediate_size: int = None
+    n_local_heads: int = -1
+    head_dim: int = 64
+    rope_base: float = 10000
+    norm_eps: float = 1e-5
+    num_experts: int = 8
+    num_activated_experts: int = 2
+
+    def __post_init__(self):
+        if self.n_local_heads == -1:
+            self.n_local_heads = self.n_head
+        if self.intermediate_size is None:
+            hidden_dim = 4 * self.dim
+            n_hidden = int(2 * hidden_dim / 3)
+            self.intermediate_size = find_multiple(n_hidden, 256)
+        self.head_dim = self.dim // self.n_head
+
+    @classmethod
+    def from_name(cls, name: str):
+        if name in transformer_configs:
+            return cls(**transformer_configs[name])
+        # fuzzy search
+        config = [config for config in transformer_configs if config in str(name).upper() or config in str(name)]
+        assert len(config) == 1, name
+        return cls(**transformer_configs[config[0]])
+
+
+transformer_configs = {
+    "Mixtral-8x7B-v0.1": dict(block_size=32768, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, rope_base=1000000.0, num_experts=8, num_activated_experts=2),
+}
+
+class KVCache(nn.Module):
+    def __init__(self, max_batch_size, max_seq_length, n_heads, head_dim, dtype=torch.bfloat16):
+        super().__init__()
+        cache_shape = (max_batch_size, n_heads, max_seq_length, head_dim)
+        self.register_buffer('k_cache', torch.zeros(cache_shape, dtype=dtype))
+        self.register_buffer('v_cache', torch.zeros(cache_shape, dtype=dtype))
+
+    def update(self, input_pos, k_val, v_val):
+        # input_pos: [S], k_val: [B, H, S, D]
+        assert input_pos.shape[0] == k_val.shape[2]
+
+        k_out = self.k_cache
+        v_out = self.v_cache
+        k_out[:, :, input_pos] = k_val
+        v_out[:, :, input_pos] = v_val
+
+        return k_out, v_out
+
+class Transformer(nn.Module):
+    def __init__(self, config: ModelArgs) -> None:
+        super().__init__()
+        self.config = config
+
+        self.tok_embeddings = nn.Embedding(config.vocab_size, config.dim)
+        self.layers = nn.ModuleList(TransformerBlock(config) for _ in range(config.n_layer))
+        self.norm = RMSNorm(config.dim, eps=config.norm_eps)
+        self.output = nn.Linear(config.dim, config.vocab_size, bias=False)
+
+        self.freqs_cis: Optional[Tensor] = None
+        self.mask_cache: Optional[Tensor] = None
+        self.max_batch_size = -1
+        self.max_seq_length = -1
+
+    def setup_caches(self, max_batch_size, max_seq_length):
+        if self.max_seq_length >= max_seq_length and self.max_batch_size >= max_batch_size:
+            return
+        head_dim = self.config.dim // self.config.n_head
+        max_seq_length = find_multiple(max_seq_length, 8)
+        self.max_seq_length = max_seq_length
+        self.max_batch_size = max_batch_size
+        for b in self.layers:
+            b.attention.kv_cache = KVCache(max_batch_size, max_seq_length, self.config.n_local_heads, head_dim)
+
+        self.freqs_cis = precompute_freqs_cis(self.config.block_size, self.config.dim // self.config.n_head, self.config.rope_base)
+        self.causal_mask = torch.tril(torch.ones(self.max_seq_length, self.max_seq_length, dtype=torch.bool))
+
+    def forward(self, idx: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
+        assert self.freqs_cis is not None, "Caches must be initialized first"
+        mask = self.causal_mask[None, None, input_pos]
+        freqs_cis = self.freqs_cis[input_pos]
+        x = self.tok_embeddings(idx)
+
+        for i, layer in enumerate(self.layers):
+            x = layer(x, input_pos, freqs_cis, mask)
+        x = self.norm(x)
+        logits = self.output(x)
+        return logits
+
+    @classmethod
+    def from_name(cls, name: str):
+        return cls(ModelArgs.from_name(name))
+
+
+class TransformerBlock(nn.Module):
+    def __init__(self, config: ModelArgs) -> None:
+        super().__init__()
+        self.attention = Attention(config)
+        self.block_sparse_moe = MOEFeedForward(config)
+        self.ffn_norm = RMSNorm(config.dim, config.norm_eps)
+        self.attention_norm = RMSNorm(config.dim, config.norm_eps)
+
+    def forward(self, x: Tensor, input_pos: Tensor, freqs_cis: Tensor, mask: Tensor) -> Tensor:
+        h = x + self.attention(self.attention_norm(x), freqs_cis, mask, input_pos)
+        out = h + self.block_sparse_moe(self.ffn_norm(h))
+        return out
+
+
+class Attention(nn.Module):
+    def __init__(self, config: ModelArgs):
+        super().__init__()
+        assert config.dim % config.n_head == 0
+
+        total_head_dim = (config.n_head + 2 * config.n_local_heads) * config.head_dim
+        # key, query, value projections for all heads, but in a batch
+        self.wqkv = nn.Linear(config.dim, total_head_dim, bias=False)
+        self.wo = nn.Linear(config.dim, config.dim, bias=False)
+        self.kv_cache = None
+
+        self.n_head = config.n_head
+        self.head_dim = config.head_dim
+        self.n_local_heads = config.n_local_heads
+        self.dim = config.dim
+        self._register_load_state_dict_pre_hook(self.load_hook)
+
+    def load_hook(self, state_dict, prefix, *args):
+        if prefix + "wq.weight" in state_dict:
+            wq = state_dict.pop(prefix + "wq.weight")
+            wk = state_dict.pop(prefix + "wk.weight")
+            wv = state_dict.pop(prefix + "wv.weight")
+            state_dict[prefix + "wqkv.weight"] = torch.cat([wq, wk, wv])
+
+    def forward(self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
+        bsz, seqlen, _ = x.shape
+
+        kv_size = self.n_local_heads * self.head_dim
+        q, k, v = self.wqkv(x).split([self.dim, kv_size, kv_size], dim=-1)
+
+        q = q.view(bsz, seqlen, self.n_head, self.head_dim)
+        k = k.view(bsz, seqlen, self.n_local_heads, self.head_dim)
+        v = v.view(bsz, seqlen, self.n_local_heads, self.head_dim)
+
+        q = apply_rotary_emb(q, freqs_cis)
+        k = apply_rotary_emb(k, freqs_cis)
+
+        q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
+
+        if self.kv_cache is not None:
+            k, v = self.kv_cache.update(input_pos, k, v)
+
+        k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
+        v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
+        y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)
+
+        y = y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)
+
+        y = self.wo(y)
+        return y
+
+
+class ConditionalFeedForward(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.w1 = nn.Parameter(torch.empty(config.num_experts, config.intermediate_size, config.dim))
+        self.w2 = nn.Parameter(torch.empty(config.num_experts, config.dim, config.intermediate_size))
+        self.w3 = nn.Parameter(torch.empty(config.num_experts, config.intermediate_size, config.dim))
+
+    def forward(self, x: Tensor, expert_indices: Tensor) -> Tensor:
+        w1_weights = self.w1[expert_indices] # [T, A, D, D]
+        w3_weights = self.w3[expert_indices] # [T, A, D, D]
+        w2_weights = self.w2[expert_indices]  # [T, A, D, D]
+        x1 = F.silu(torch.einsum('ti,taoi -> tao', x, w1_weights))
+        x3 = torch.einsum('ti, taoi -> tao', x, w3_weights)
+        expert_outs =  torch.einsum('tao, taio -> tai', (x1 * x3), w2_weights)
+        return expert_outs
+
+
+class MOEFeedForward(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        self.gate = nn.Linear(config.dim, config.num_experts, bias=False)
+        self.cond_ffn = ConditionalFeedForward(config)
+        self.dim = config.dim
+        self.num_activated_experts = config.num_activated_experts
+    def forward(self, x: Tensor) -> Tensor:
+        x = x.view(-1, self.dim)
+        # T = num_tokens, E = num_experts, D = hidden dim, A = activated experts
+        # x: [T, D]
+        scores = self.gate(x) # [T, E]
+        expert_weights = F.softmax(scores, dim=-1)
+        expert_weights, expert_indices = torch.topk(expert_weights, self.num_activated_experts, dim=-1) # [T, A], [T, A]
+        expert_weights /= expert_weights.sum(dim=-1, keepdim=True) # [T, A]
+        expert_outs = self.cond_ffn(x, expert_indices)
+        return torch.einsum('tai,ta -> ti', expert_outs, expert_weights)
+
+
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-5):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+
+    def _norm(self, x):
+        return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)
+
+    def forward(self, x: Tensor) -> Tensor:
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+
+
+def precompute_freqs_cis(
+    seq_len: int, n_elem: int, base: int = 10000
+) -> Tensor:
+    freqs = 1.0 / (base ** (torch.arange(0, n_elem, 2)[: (n_elem // 2)].float() / n_elem))
+    t = torch.arange(seq_len, device=freqs.device)
+    freqs = torch.outer(t, freqs)
+    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
+    cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)
+    return cache.to(dtype=torch.bfloat16)
+
+
+def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:
+    xshaped = x.float().reshape(*x.shape[:-1], -1, 2)
+    freqs_cis = freqs_cis.view(1, xshaped.size(1), 1, xshaped.size(3), 2)
+    x_out2 = torch.stack(
+        [
+            xshaped[..., 0] * freqs_cis[..., 0] - xshaped[..., 1] * freqs_cis[..., 1],
+            xshaped[..., 1] * freqs_cis[..., 0] + xshaped[..., 0] * freqs_cis[..., 1],
+        ],
+        -1,
+    )
+
+    x_out2 = x_out2.flatten(3)
+    return x_out2.type_as(x)
diff --git a/experimental/torch_xla2/test/moe/moe_test.py b/experimental/torch_xla2/test/moe/moe_test.py
new file mode 100644
index 00000000000..f8d4a22e3f2
--- /dev/null
+++ b/experimental/torch_xla2/test/moe/moe_test.py
@@ -0,0 +1,75 @@
+import torch_xla2
+import torch_xla2.interop
+import torch
+import unittest
+import jax
+
+
+from test.moe import model
+
+
+class TestMoe(unittest.TestCase):
+
+    def _make_tiny_config(self):
+        return model.ModelArgs(
+            block_size = 128,
+            vocab_size = 32000,
+            n_layer = 4,
+            n_head = 4,
+            dim = 128,
+            intermediate_size = None,
+            n_local_heads = -1,
+            head_dim = 32,
+            rope_base = 10000,
+            norm_eps = 1e-5,
+            num_experts = 8,
+            num_activated_experts = 2,
+        )
+
+    def _random_init(self, model):
+        new_state_dict = {}
+
+        for k, v in model.state_dict().items():
+            new_state_dict[k] = torch.randn_like(v)
+        
+        model.load_state_dict(new_state_dict, assign=True)
+        return model
+
+
+
+    def test_moe_layer(self):
+        model_args = self._make_tiny_config()
+
+        moe_layer = model.MOEFeedForward(model_args)
+        moe_layer = self._random_init(moe_layer)
+        seqlen = 32
+        x = torch.randn((seqlen, model_args.dim))
+        res = moe_layer(x)
+
+        env = torch_xla2.default_env()
+        model_xla = env.to_xla(moe_layer)
+        x_xla = env.to_xla(x)
+        with jax.default_matmul_precision('float32'):
+            res_xla = model_xla(x_xla)
+        res2 = torch_xla2.tensor.j2t(res_xla._elem)
+        print('max diff', torch.max((res - res2).abs()))
+
+        self.assertTrue( 
+            torch.allclose(res2, res, atol=1e-2))
+
+        # test can jit
+
+        def f(weights, x):
+            return torch.func.functional_call(moe_layer, weights, (x, ))
+
+        fjitted = torch_xla2.interop.jax_jit(f)
+        weights_xla = env.to_xla(moe_layer.state_dict())
+
+        print(fjitted(weights_xla, x_xla))
+
+
+
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/experimental/torch_xla2/test/test_context.py b/experimental/torch_xla2/test/test_context.py
index 1a75a7d23d0..a6bcda5113a 100644
--- a/experimental/torch_xla2/test/test_context.py
+++ b/experimental/torch_xla2/test/test_context.py
@@ -1,20 +1,22 @@
 import unittest
 
 import torch
-import torch_xla2
 from torch_xla2 import tensor
 
+xla_env = tensor.Environment(0)
+
 
 class TestContext(unittest.TestCase):
+
   def test_mode_context_manager(self):
-    with torch_xla2.mode():
+    with xla_env:
       x = torch.full((3, 3), -1)
       self.assertIsInstance(x, tensor.XLATensor2)
       y = x.abs()
       self.assertIsInstance(y, tensor.XLATensor2)
 
   @staticmethod
-  @torch_xla2.mode()
+  @xla_env
   def _test_mode_decorator():
     x = torch.full((3, 3), -1)
     y = x.abs()
diff --git a/experimental/torch_xla2/test/test_core_aten_ops.py b/experimental/torch_xla2/test/test_core_aten_ops.py
index 357e41c9101..c11884fa370 100644
--- a/experimental/torch_xla2/test/test_core_aten_ops.py
+++ b/experimental/torch_xla2/test/test_core_aten_ops.py
@@ -1,7 +1,6 @@
 import unittest
 
 import torch
-from torch_xla2 import ops_registry
 from torch_xla2 import tensor
 
 from . import test_base
@@ -34,12 +33,13 @@ def run_export_and_compare(testcase,
                            rtol=1e-5,
                            equal_nan=True,
                            ignore_indices=False):
+  
   with testcase.subTest("torch_eval"):
     res = func(*args, **kwargs)
     with testcase.subTest("torch_xla2_eval"):
-      args2, kwargs2 = pytree.tree_map_only(torch.Tensor, tensor.move_to_device,
-                                            (args, kwargs))
-      res2 = func(*args2, **kwargs2)
+      args2, kwargs2 = testcase.env.to_xla((args, kwargs))
+      with testcase.env:
+        res2 = func(*args2, **kwargs2)
       res2 = pytree.tree_map_only(tensor.XLATensor2, lambda t: t.torch(), res2)
       # import pdb; pdb.set_trace()
       with testcase.subTest("torch_xla2_diff:" + str(atol)):
@@ -61,11 +61,11 @@ class TestCoreAtenOps(unittest.TestCase):
   @classmethod
   def setUpClass(cls):
     super().setUpClass()
-    ops_registry.print_missing_ops()
 
   def setUp(self):
     super().setUp()
     torch.manual_seed(0)
+    self.env = tensor.Environment(0)
 
   def test_aten_abs_0(self):
     args = (torch.randn((10, 10)).to(torch.float32),)
@@ -2109,7 +2109,7 @@ def test_aten_logit_0(self):
   def test_aten_logit_1(self):
     args = (torch.randn((10, 10)).to(torch.float16),)
     kwargs = dict()
-    run_export_and_compare(self, torch.ops.aten.logit, args, kwargs)
+    run_export_and_compare(self, torch.ops.aten.logit, args, kwargs, atol=0.01,)
 
   def test_aten_logit_2(self):
     args = (torch.randint(0, 10, (10, 10)).to(torch.int32),)
@@ -2697,6 +2697,117 @@ def test_aten_native_layer_norm_0(self):
     kwargs = dict()
     run_export_and_compare(self, torch.ops.aten.native_layer_norm, args, kwargs)
 
+  def test_aten_native_batch_norm_legit(self):
+    batch = 3
+    channel = 2
+    args = (
+        torch.randn((batch,channel,2,2)).to(torch.float32),
+        torch.ones(channel),
+        torch.zeros(channel),
+        torch.zeros(channel),
+        torch.ones(channel),
+        False,
+        0.5,
+        1,
+    )
+    kwargs = dict()
+    run_export_and_compare(self, torch.ops.aten._native_batch_norm_legit, args, kwargs)
+
+  def test_aten_native_batch_norm_legit_none(self):
+    batch = 3
+    channel = 2
+    args = (
+        torch.randn((batch,channel,4,4)).to(torch.float32),
+        None,
+        None,
+        torch.ones(channel),
+        torch.zeros(channel),
+        False,
+        0.5,
+        1,
+    )
+    kwargs = dict()
+    run_export_and_compare(self, torch.ops.aten._native_batch_norm_legit, args, kwargs)
+
+  def test_aten_native_batch_norm_legit_training_none(self):
+    batch = 3
+    channel = 2
+    args = (
+        torch.randn((batch,channel,4,3)).to(torch.float32),
+        None,
+        None,
+        torch.zeros(channel),
+        torch.ones(channel),
+        True,
+        0.2,
+        2e-5,
+    )
+    kwargs = dict()
+    run_export_and_compare(self, torch.ops.aten._native_batch_norm_legit, args, kwargs)
+
+  def test_aten_native_batch_norm_legit_no_training(self):
+    batch = 3
+    channel = 2
+    args = (
+        torch.randn((batch,channel,4,3)).to(torch.float32),
+        torch.ones(channel),
+        torch.zeros(channel),
+        torch.zeros(channel),
+        torch.ones(channel),
+        0.2,
+        2e-5,
+    )
+    kwargs = dict()
+    run_export_and_compare(self, torch.ops.aten._native_batch_norm_legit_no_training, args, kwargs)
+
+  def test_aten_native_batch_norm_training(self):
+    batch = 3
+    channel = 2
+    args = (
+        torch.randn((batch,channel,4,3)).to(torch.float32),
+        torch.ones(channel),
+        torch.zeros(channel),
+        torch.zeros(channel),
+        torch.ones(channel),
+        True,
+        0.1,
+        1e-5,
+    )
+    kwargs = dict()
+    run_export_and_compare(self, torch.ops.aten.native_batch_norm, args, kwargs)
+
+  def test_aten_native_batch_norm_training_none(self):
+    batch = 3
+    channel = 2
+    args = (
+        torch.randn((batch,channel,4,3)).to(torch.float32),
+        None,
+        None,
+        torch.zeros(channel),
+        torch.ones(channel),
+        True,
+        0.1,
+        1e-5,
+    )
+    kwargs = dict()
+    run_export_and_compare(self, torch.ops.aten.native_batch_norm, args, kwargs)
+
+  def test_aten_native_batch_norm_eval(self):
+    batch = 3
+    channel = 2
+    args = (
+        torch.randn((batch,channel,4,3)).to(torch.float32),
+        torch.ones(channel),
+        torch.zeros(channel),
+        torch.zeros(channel),
+        torch.ones(channel),
+        False,
+        0.2,
+        2e-5,
+    )
+    kwargs = dict()
+    run_export_and_compare(self, torch.ops.aten.native_batch_norm, args, kwargs)
+
   def test_aten_ne_Scalar_0(self):
     args = (
         torch.randint(0, 10, (10, 10)).to(torch.int32),
@@ -3639,8 +3750,9 @@ def test_aten__softmax_1(self):
   def _compare_sorted_result(self, args):
     res = torch.ops.aten.sort(*args)
     with self.subTest("torch_xla2_eval"):
-      args2 = pytree.tree_map_only(torch.Tensor, tensor.move_to_device, args)
-      res2 = torch.ops.aten.sort(*args2)
+      args2 = self.env.to_xla(args)
+      with self.env:
+        res2 = torch.ops.aten.sort(*args2)
 
     # The second argument is the sorted index. These might not be
     # identical from torch vs. jax; but both can be correct
diff --git a/experimental/torch_xla2/test/test_exports.py b/experimental/torch_xla2/test/test_exports.py
index 81e91452c02..ce465324a4c 100644
--- a/experimental/torch_xla2/test/test_exports.py
+++ b/experimental/torch_xla2/test/test_exports.py
@@ -34,33 +34,109 @@ def setUp(self):
 
   def test_interpolate(self):
 
+    # Check Accuracy
     arg = (torch.randn(3, 3, 200, 200),)
     model = Interpolate()
-
     ans = model(*arg)
 
     with torch.no_grad():
       exported = torch.export.export(model, arg)
-      weights, func = torch_xla2.export.exported_program_to_jax(exported)
-      argj = tensor.t2j(arg[0])
-      ans2 = jax.jit(func)(weights, (argj,))[0]
-      ans2 = tensor.j2t(ans2)
-      self.assertTrue(torch.allclose(ans, ans2, atol=1e-3))
+    weights, func = torch_xla2.export.exported_program_to_jax(exported)
+    argj = tensor.t2j(arg[0])
+    ans2 = jax.jit(func)(weights, (argj,))[0]
+    ans2 = tensor.j2t(ans2)
+    self.assertTrue(torch.allclose(ans, ans2, atol=1e-3))
+
+    # Convert to StableHLO
+    stablehlo = torch_xla2.export.exported_program_to_stablehlo(exported)
+    module_str = str(stablehlo.mlir_module())
+    self.assertIn("func.func public @main", module_str)
+    self.assertIn("func.func private @clip(%arg0: tensor<500xf32>", module_str)
+    self.assertIn("stablehlo.minimum", module_str)
 
   def test_constant(self):
 
+    # Check Accuracy
     arg = (torch.randn(10, 10),)
     model = TensorConstant()
-
     ans = model(*arg)
 
     with torch.no_grad():
       exported = torch.export.export(model, arg)
-      weights, func = torch_xla2.export.exported_program_to_jax(exported)
-      argj = tensor.t2j(arg[0])
-      ans2 = jax.jit(func)(weights, (argj,))[0]
-      ans2 = tensor.j2t(ans2)
-      self.assertTrue(torch.allclose(ans, ans2, atol=1e-5))
+
+    weights, func = torch_xla2.export.exported_program_to_jax(exported)
+    argj = tensor.t2j(arg[0])
+    ans2 = jax.jit(func)(weights, (argj,))[0]
+    ans2 = tensor.j2t(ans2)
+    self.assertTrue(torch.allclose(ans, ans2, atol=1e-5))
+
+    # Convert to StableHLO
+    stablehlo = torch_xla2.export.exported_program_to_stablehlo(exported)
+    module_str = str(stablehlo.mlir_module())
+    self.assertIn("func.func public @main", module_str)
+    self.assertIn("stablehlo.divide", module_str)
+
+  def test_interpolate_dynamic(self):
+    # Export with dynamic dimension constraints on both min and max
+    arg = (torch.randn(3, 3, 200, 200),)
+    model = Interpolate()
+    ans = model(*arg)
+    dynamic_shapes = ({0:  torch.export.Dim("b", min=3, max=10)},)
+
+    with torch.no_grad():
+      exported = torch.export.export(model, arg, dynamic_shapes=dynamic_shapes)
+    stablehlo = torch_xla2.export.exported_program_to_stablehlo(exported)
+    module_str = str(stablehlo.mlir_module())
+
+    # Look for dynamic shape artifacts
+    self.assertIn("func.func public @main(%arg0: tensor<?x3x200x200xf32>", module_str)
+    self.assertIn("stablehlo.dynamic_broadcast_in_dim", module_str)
+    self.assertIn("stablehlo.dynamic_gather", module_str)
+
+  def test_export_dtypes(self):
+    DTYPE_TO_MLIR_STR = {
+      # NO_MAPPING        : jnp.float0 (signless scalar int)
+      torch.bool          : "i1",
+      # NO_MAPPING        : "i4"
+      torch.int8          : "i8",
+      torch.int16         : "i16",
+      torch.int32         : "i32",
+      torch.int64         : "i64",
+      torch.long          : "i64",
+      # NO_MAPPING        : "ui4"
+      torch.uint8         : "ui8",
+      torch.uint16        : "ui16",
+      torch.uint32        : "ui32",
+      torch.uint64        : "ui64",
+      # NO_MAPPING        : "f8E4M3B11FNUZ"
+      torch.float8_e4m3fn : "f8E4M3FN",
+      # NO_MAPPING        : f8E4M3FNUZ
+      torch.float8_e5m2   : "f8E5M2",
+      # NO_MAPPING        : f8E5M2FNUZ
+      torch.bfloat16      : "bf16",
+      torch.half          : "f16",
+      torch.float16       : "f16",
+      torch.float32       : "f32",
+      torch.float64       : "f64",
+      torch.double        : "f64",
+      torch.complex64     : "complex<f32>",
+      torch.complex128    : "complex<f64>",
+      None                : None,
+    }
+
+    model = TensorConstant()
+    for torch_dtype in torch_xla2.tensor.TORCH_DTYPE_TO_JAX.keys():
+      if torch_dtype == None:
+        ## TODO: Figure out what the None mapping should be, seems like:
+        ##   torch.tensor(dtype=None) maps to f32
+        ##   jnp.tensor(dtype=None) maps to f64
+        continue
+      arg = (torch.randn(10).to(torch_dtype),)
+      with torch.no_grad():
+        exported = torch.export.export(model, arg)
+      stablehlo = torch_xla2.export.exported_program_to_stablehlo(exported)
+      module_str = str(stablehlo.mlir_module())
+      self.assertIn(DTYPE_TO_MLIR_STR[torch_dtype], module_str)
 
 
 if __name__ == '__main__':
diff --git a/experimental/torch_xla2/test/test_extra.py b/experimental/torch_xla2/test/test_extra.py
deleted file mode 100644
index 768488d6a99..00000000000
--- a/experimental/torch_xla2/test/test_extra.py
+++ /dev/null
@@ -1,64 +0,0 @@
-import unittest
-import torch
-import torch.nn.functional as F
-import jax
-import jax.numpy as jnp
-import torch_xla2
-from torch_xla2 import tensor, extra
-
-
-class ExtraTest(unittest.TestCase):
-
-  def setUp(self):
-    torch.manual_seed(0)
-
-  def test_standard_callable(self):
-    def f(a, b):
-      return torch.add(a, b)
-
-    a = jnp.ones((10, ))
-    b = jnp.ones((10, ))
-
-    c = extra.jax_view(f)(a, b)
-    self.assertTrue(jnp.allclose(c, a + b))
-
-    def f2(a, b):
-      return jnp.add(a, b)
-
-    a = tensor.move_to_device(torch.ones((10, )))
-    b = tensor.move_to_device(torch.ones((10, )))
-    c2 = extra.torch_view(f2)(a, b)
-
-    self.assertTrue(jnp.allclose(c2._elem, c))
-
-
-
-  def test_fori_loop(self):
-    a = tensor.move_to_device(torch.ones((10, 10)))
-
-    def body(i, c):
-      return c + a[i]
-
-    init_val = tensor.move_to_device(torch.zeros(10))
-    res = extra.fori_loop(0, 10, body, init_val)
-    expect = torch.ones(10) * 10
-    self.assertTrue(torch.allclose(tensor.j2t(res._elem), expect))
-
-  def test_jax_jit(self):
-
-    # functions that acts on torch tensor
-    def f(a, b):
-      return torch.sin(a) + torch.cos(b)
-    
-    fjitted = extra.jax_jit(f)
-    a = torch.rand((10, 10))
-    b = torch.rand((10, 10))
-    aj = tensor.move_to_device(a)
-    bj = tensor.move_to_device(b)
-    res = f(a, b)
-    res2 = fjitted(aj, bj)
-    self.assertTrue(torch.allclose(res, tensor.j2t(res2._elem)))
-
-
-if __name__ == '__main__':
-  unittest.main()
diff --git a/experimental/torch_xla2/test/test_functions.py b/experimental/torch_xla2/test/test_functions.py
index 76e842d6fdd..2d624b25b5b 100644
--- a/experimental/torch_xla2/test/test_functions.py
+++ b/experimental/torch_xla2/test/test_functions.py
@@ -3,12 +3,14 @@
 from absl.testing import parameterized
 import torch
 import torch_xla2
-import torch_xla2.functions
 import torch_xla2.tensor
 
 
 class TestTorchFunctions(parameterized.TestCase):
 
+  def setUp(self):
+    self.env = torch_xla2.tensor.Environment(0)
+
   @parameterized.named_parameters(
       ('tensor_2d', lambda: torch.tensor([[0.1, 1.2], [2.2, 3.1], [4.9, 5.2]])),
       ('tensor_1d', lambda: torch.tensor([0, 1],)),
@@ -32,7 +34,7 @@ class TestTorchFunctions(parameterized.TestCase):
   def test_tensor_constructor(self, func: Callable[[], torch.Tensor]):
     expected = func()
 
-    with torch_xla2.functions.XLAFunctionMode():
+    with self.env:
       actual = func()
       self.assertIsInstance(actual, torch_xla2.tensor.XLATensor2)
 
diff --git a/experimental/torch_xla2/test/test_mutations.py b/experimental/torch_xla2/test/test_mutations.py
index 2f9ddca975b..50d78aa0fae 100644
--- a/experimental/torch_xla2/test/test_mutations.py
+++ b/experimental/torch_xla2/test/test_mutations.py
@@ -6,46 +6,43 @@
 
 class TestMutations(TestCase):
 
-  def test_add(self):
-    x = torch.tensor([1, 2, 3], dtype=torch.int32)
-    y = torch.tensor([4, 5, 6], dtype=torch.int32)
+  def setUp(self):
+    self.env = torch_xla2.tensor.Environment(0)
 
-    x = torch_xla2.tensor.move_to_device(x)
-    y = torch_xla2.tensor.move_to_device(y)
-    x.add_(y)
-    xt = torch_xla2.tensor.j2t(x._elem)
-    self.assertEqual(xt, torch.tensor([5, 7, 9], dtype=torch.int32))
+  def test_add(self):
+    with self.env:
+      x = torch.tensor([1, 2, 3], dtype=torch.int32)
+      y = torch.tensor([4, 5, 6], dtype=torch.int32)
+      x.add_(y)
+      xt = torch_xla2.tensor.j2t(x._elem)
+      self.assertEqual(xt, torch.tensor([5, 7, 9], dtype=torch.int32))
 
   def test_sub(self):
-    x = torch.tensor([1, 2, 3], dtype=torch.int32)
-    y = torch.tensor([4, 5, 6], dtype=torch.int32)
-
-    x = torch_xla2.tensor.move_to_device(x)
-    y = torch_xla2.tensor.move_to_device(y)
-    x.sub_(y)
-    xt = torch_xla2.tensor.j2t(x._elem)
-    self.assertEqual(xt, torch.tensor([-3, -3, -3], dtype=torch.int32))
+    with self.env:
+      x = torch.tensor([1, 2, 3], dtype=torch.int32)
+      y = torch.tensor([4, 5, 6], dtype=torch.int32)
+      x.sub_(y)
+      xt = torch_xla2.tensor.j2t(x._elem)
+      self.assertEqual(xt, torch.tensor([-3, -3, -3], dtype=torch.int32))
 
   def test_mul(self):
-    x = torch.tensor([1, 2, 3], dtype=torch.int32)
-    y = torch.tensor([4, 5, 6], dtype=torch.int32)
+    with self.env:
+      x = torch.tensor([1, 2, 3], dtype=torch.int32)
+      y = torch.tensor([4, 5, 6], dtype=torch.int32)
 
-    x = torch_xla2.tensor.move_to_device(x)
-    y = torch_xla2.tensor.move_to_device(y)
-    x.mul_(y)
-    xt = torch_xla2.tensor.j2t(x._elem)
-    self.assertEqual(xt, torch.tensor([4, 10, 18], dtype=torch.int32))
+      x.mul_(y)
+      xt = torch_xla2.tensor.j2t(x._elem)
+      self.assertEqual(xt, torch.tensor([4, 10, 18], dtype=torch.int32))
 
   def test_div(self):
-    x = torch.tensor([1, 2, 3], dtype=torch.int32)
-    y = torch.tensor([4, 5, 6], dtype=torch.int32)
-
-    x = torch_xla2.tensor.move_to_device(x)
-    y = torch_xla2.tensor.move_to_device(y)
-    x.div_(y)
-    xt = torch_xla2.tensor.j2t(x._elem)
-    self.assertEqual(xt,
-                     torch.tensor([1. / 4, 2. / 5, 3. / 6], dtype=torch.float))
+    with self.env:
+      x = torch.tensor([1, 2, 3], dtype=torch.int32)
+      y = torch.tensor([4, 5, 6], dtype=torch.int32)
+
+      x.div_(y)
+      xt = torch_xla2.tensor.j2t(x._elem)
+      self.assertEqual(xt,
+                      torch.tensor([1. / 4, 2. / 5, 3. / 6], dtype=torch.float))
 
 
 if __name__ == '__main__':
diff --git a/experimental/torch_xla2/test/test_ops.py b/experimental/torch_xla2/test/test_ops.py
index ed14e636e5c..1e10706f100 100644
--- a/experimental/torch_xla2/test/test_ops.py
+++ b/experimental/torch_xla2/test/test_ops.py
@@ -8,26 +8,13 @@
 from torch.utils import _pytree as pytree
 from torch_xla2 import tensor
 
+
 skiplist = {
     "__getitem__",
     "__rmatmul__",
     "__rpow__",
-    "_native_batch_norm_legit",
     "_segment_reduce",
     "_upsample_bilinear2d_aa",
-    "addbmm",
-    "addmm",
-    "addmv",
-    "addr",
-    "all",
-    "allclose",
-    "amax",
-    "amin",
-    "aminmax",
-    "angle",
-    "any",
-    "argmax",
-    "argmin",
     "argsort",
     "as_strided",
     "as_strided_scatter",
@@ -210,7 +197,6 @@
     "nansum",
     "narrow_copy",
     "narrow",
-    "native_batch_norm",
     "native_layer_norm",
     "new_empty",
     "new_empty_strided",
@@ -570,6 +556,7 @@
     "special.xlog1py",
     "split",
     "split_with_sizes",
+    "split_with_sizes_copy",
     "sqrt",
     "square",
     "stack",
@@ -636,10 +623,10 @@ def run_export_and_compare(testcase,
   with testcase.subTest("torch_eval"):
     res = func(sample_input.input, *sample_input.args, **sample_input.kwargs)
     with testcase.subTest("torch_xla2_eval"):
-      input2, args2, kwargs2 = pytree.tree_map_only(
-          torch.Tensor, tensor.move_to_device,
-          (sample_input.input, sample_input.args, sample_input.kwargs))
-      res2 = func(input2, *args2, **kwargs2)
+      input2, args2, kwargs2 = testcase.env.to_xla((
+        sample_input.input, sample_input.args, sample_input.kwargs))
+      with testcase.env:
+        res2 = func(input2, *args2, **kwargs2)
       res2 = pytree.tree_map_only(tensor.XLATensor2, lambda t: t.torch(), res2)
       with testcase.subTest("torch_xla2_diff:" + str(atol)):
         if ignore_indices and isinstance(res, tuple) and len(res) == 2:
@@ -664,6 +651,9 @@ class TestOpInfo(TestCase):
   def setUpClass(cls):
     print('op_db size: ', len(op_db), 'testing: ', len(ops_to_test))
 
+  def setUp(self):
+    self.env = tensor.Environment(0)
+
   @ops(ops_to_test, allowed_dtypes=(torch.float32, torch.long))
   def test_reference_eager(self, device, dtype, op):
     sample_inputs = op.sample_inputs(device, dtype)
diff --git a/experimental/torch_xla2/test/test_symbolic_shapes.py b/experimental/torch_xla2/test/test_symbolic_shapes.py
new file mode 100644
index 00000000000..42e9778893a
--- /dev/null
+++ b/experimental/torch_xla2/test/test_symbolic_shapes.py
@@ -0,0 +1,92 @@
+import unittest
+import torch
+import jax
+import torch_xla2
+
+class AddOne(torch.nn.Module):
+
+  def __init__(self):
+    super().__init__()
+
+  def forward(self, a):
+    return a + 1
+
+class ConcatAddModel(torch.nn.Module):
+  def __init__(self):
+    super().__init__()
+
+  def forward(self, a, b):
+    a = torch.concat([a, a], dim=0)
+    return a + b
+
+class SymbolicShapeTest(unittest.TestCase):
+  """Test possible symbolic shape computations that upstream torch export can
+  emit. Seems to be currently limited to a few binary math operations where one
+  operand is a symbolic variable/expr and the other is a constant integer.
+  """
+
+  def setUp(self):
+    torch.manual_seed(0)
+
+  def test_constraints_min_max(self):
+    """Test a model with basic min/max dimension restrictions 
+    """
+
+    # Arg shapes are a=s0{<=10}, b=s0*2
+    model = AddOne()
+    args = (torch.rand(5),)
+    sym_a = torch.export.Dim("a", min=3, max=10)
+    dynamic_shapes = ({0: sym_a},)
+
+    with torch.no_grad():
+      exported = torch.export.export(model, args=args, dynamic_shapes=dynamic_shapes)
+    stablehlo = torch_xla2.export.exported_program_to_stablehlo(exported)
+    module_str = str(stablehlo.mlir_module())
+
+    self.assertRegex(module_str, r"stablehlo.constant.*3")
+    self.assertRegex(module_str, r"shape_assertion.*s[0-9]+ >= 3")
+    self.assertRegex(module_str, r"stablehlo.constant.*10")
+    self.assertRegex(module_str, r"shape_assertion.*s[0-9]+ <= 10")
+
+  def test_constraints_multiply(self):
+    """Test a model with a slightly more complex constraint, where the input
+    shapes are determined by an equation of the other, in this case input shapes
+    are s0{<=10} and s0*2.
+    """
+    # Arg shapes are a=s0{<=10}, b=s0*2
+    model = ConcatAddModel()
+    args = (torch.rand(2),torch.rand(4))
+    sym_a = torch.export.Dim("a", max=10)
+    sym_b = sym_a*2
+    dynamic_shapes = ({0: sym_a}, {0: sym_b})
+
+    with torch.no_grad():
+      exported = torch.export.export(model, args=args, dynamic_shapes=dynamic_shapes)
+    stablehlo = torch_xla2.export.exported_program_to_stablehlo(exported)
+    module_str = str(stablehlo.mlir_module())
+
+    self.assertRegex(module_str, r"stablehlo.constant.*10")
+    self.assertRegex(module_str, r"shape_assertion.*s[0-9]+ <= 10")
+    self.assertRegex(module_str, r"stablehlo.constant.*2")
+    self.assertRegex(module_str, r"shape_assertion.*2\*s[0-9]+")
+  
+  def test_constraint_indirection(self):
+    """Test a model where none of the shapes are directly symbolic variables
+    but all are expressions of symints that don't appear directly in the model.
+    """
+
+    # Arg shapes are b=s0{<=10}*2
+    args = (torch.randn(10, 10),)
+    model = AddOne()
+    sym_a = torch.export.Dim("a", max=10)
+    sym_b = sym_a*2
+    dynamic_shapes = ({0: sym_b},)
+
+    with torch.no_grad():
+      exported = torch.export.export(model, args=args, dynamic_shapes=dynamic_shapes)
+    stablehlo = torch_xla2.export.exported_program_to_stablehlo(exported)
+    module_str = str(stablehlo.mlir_module())
+
+    self.assertRegex(module_str, r"shape_assertion.*s[0-9]+ <= 10")
+    self.assertRegex(module_str, r"shape_assertion.*2\*s[0-9]+")
+
diff --git a/experimental/torch_xla2/test/test_unbounded_dynamism.py b/experimental/torch_xla2/test/test_unbounded_dynamism.py
new file mode 100644
index 00000000000..0cd800cb1a7
--- /dev/null
+++ b/experimental/torch_xla2/test/test_unbounded_dynamism.py
@@ -0,0 +1,662 @@
+import re
+import sys
+import unittest
+
+import numpy as np
+import torch
+from torch.export import Dim, export
+from torch_xla2.export import exported_program_to_stablehlo as exp2shlo
+
+## This file is copied from `xla/test/stablehlo/test_unbounded_dynamism.py`
+## To test that torch_xla2 has identical behavior.
+## The only differences in this test files are that torch_xla2 export preserves
+## argument order more often than torch_xla export.
+##
+## This broke ~5 tests, for example: test_bmm_dynamic_out_dim
+##   args = (
+##     torch.rand((8, 128, 256)),
+##     torch.rand((8, 256, 3)),
+##   )
+##   dynamic_shapes = ((None, {2: Dim("dim")}),)
+##   ...
+##   torch_xla_regex = r'%arg.: tensor<8x256x\?xf32>.*%arg.: tensor<8x128x256xf32>.*->.*tensor<8x128x\?xf32>'
+##   torch_xla2_regex = r'%arg.: tensor<8x128x256xf32>.*%arg.: tensor<8x256x\?xf32>.*->.*tensor<8x128x\?xf32>'
+
+# Shim to run tests
+class ExportAdapter():
+  def __init__(self, export):
+    self.export = export
+
+  def get_stablehlo_text(self):
+    return self.export.mlir_module()
+
+def exported_program_to_stablehlo(exported):
+  return ExportAdapter(exp2shlo(exported))
+
+def wrap_func_as_nn_module(f):
+  class M(torch.nn.Module):
+    def __init__(self):
+      super().__init__()
+      
+    def forward(self, *args):
+      return f(*args)
+  return M().eval()
+
+class UnboundedDynamismExportTest(unittest.TestCase):
+
+  def test_add(self):
+    args = (torch.rand((10, 197, 768)), torch.rand((10, 197, 768)))
+    dynamic_shapes = (({0: Dim("dim")}, {0: Dim("dim")}),)
+    m = wrap_func_as_nn_module(torch.ops.aten.add.Tensor)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r'tensor<\?x197x768xf32>.*tensor<\?x197x768xf32>.*->.*tensor<\?x197x768xf32>',
+            shlo_text) is not None)
+
+  def test_add_scalar(self):
+    args = (torch.rand((10, 197, 768)), 0.345)
+    dynamic_shapes = (({0: Dim("dim")}, None),)
+    m = wrap_func_as_nn_module(torch.ops.aten.add.Tensor)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r'tensor<\?x197x768xf32>.*->.*tensor<\?x197x768xf32>',
+                  shlo_text) is not None)
+
+  def test_addmm(self):
+    args = (torch.rand((5)), torch.rand((10, 5)), torch.rand((5, 5)))
+    dynamic_shapes = ((None, {0: Dim("dim")}, None),)
+    m = wrap_func_as_nn_module(torch.ops.aten.addmm.default)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r'tensor<\?x5xf32>.*->.*tensor<\?x5xf32>', shlo_text)
+        is not None)
+
+  def test_bmm(self):
+    args = (
+        torch.rand((24, 197, 64)),
+        torch.rand((24, 64, 197)),
+    )
+    dynamic_shapes = (({0: Dim("dim")}, {0: Dim("dim")}),)
+    m = wrap_func_as_nn_module(torch.ops.aten.bmm.default)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r'%arg.: tensor<\?x197x64xf32>.*%arg.: tensor<\?x64x197xf32>.*->.*tensor<\?x197x197xf32>',
+            shlo_text) is not None)
+
+  def test_bmm_dynamic_out_dim(self):
+    args = (
+        torch.rand((8, 128, 256)),
+        torch.rand((8, 256, 3)),
+    )
+    dynamic_shapes = ((None, {2: Dim("dim")}),)
+    m = wrap_func_as_nn_module(torch.ops.aten.bmm.default)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r'%arg.: tensor<8x128x256xf32>.*%arg.: tensor<8x256x\?xf32>.*->.*tensor<8x128x\?xf32>',
+            shlo_text) is not None)
+
+  def test_bmm_dynamic_reduction_dim(self):
+    args = (
+        torch.rand((8, 128, 3)),
+        torch.rand((8, 3, 256)),
+    )
+    dynamic_shapes = (({2: Dim("dim")}, {1: Dim("dim")}),)
+    m = wrap_func_as_nn_module(torch.ops.aten.bmm.default)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r'%arg.: tensor<8x128x\?xf32>.*%arg.: tensor<8x\?x256xf32>.*->.*tensor<8x128x256xf32>',
+            shlo_text) is not None)
+
+  def test_cat(self):
+    args = (torch.rand((10, 1, 768)), torch.rand((10, 196, 768)))
+    dynamic_shapes = (({0: Dim("dim")}, {0: Dim("dim")}),)
+    m = wrap_func_as_nn_module(
+        lambda x, y: torch.ops.aten.cat.default([x, y], 1))
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r'%arg.: tensor<\?x1x768xf32>.*%arg.: tensor<\?x196x768xf32>.*->.*tensor<\?x197x768xf32>',
+            shlo_text) is not None)
+
+  def test_conv(self):
+    args = (
+        torch.rand((10, 3, 224, 224)),
+        torch.rand((5, 3, 16, 16)),
+        torch.rand((5)),
+    )
+    dynamic_shapes = (({0: Dim("dim")}, None, None),)
+    m = wrap_func_as_nn_module(
+        lambda x, y, z: torch.ops.aten.convolution.default(
+            x,
+            y,
+            z,
+            [16, 16],
+            [0, 0],
+            [1, 1],
+            False,
+            [0, 0],
+            1,
+        ))
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r'tensor<\?x3x224x224xf32>.*->.*tensor<\?x5x14x14xf32>',
+                  shlo_text) is not None)
+
+  def test_conv1d(self):
+    args = (
+        torch.rand((3, 1, 800)),
+        torch.rand((512, 1, 10)),
+    )
+    dynamic_shapes = (({0: Dim("dim")}, None),)
+    # dynamic_shapes = None
+    m = wrap_func_as_nn_module(lambda x, y: torch.ops.aten.convolution.default(
+        x,
+        y,
+        None,
+        [5],
+        [0],
+        [1],
+        False,
+        [0],
+        1,
+    ))
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r'tensor<\?x1x800xf32>.*->.*tensor<\?x512x159xf32>',
+                  shlo_text) is not None)
+
+  def test_cumsum(self):
+    args = (torch.rand((10, 5)), 1)
+    dynamic_shapes = (({0: Dim("dim")}, None),)
+    m = wrap_func_as_nn_module(torch.ops.aten.cumsum.default)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r'tensor<\?x5xf32>.*->.*tensor<\?x5xf32>', shlo_text)
+        is not None)
+
+  def test_div(self):
+    args = (torch.rand((10, 12, 197)), torch.rand((10, 12, 197)))
+    dynamic_shapes = (({0: Dim("dim")}, {0: Dim("dim")}),)
+    m = wrap_func_as_nn_module(torch.ops.aten.div.Tensor)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r'tensor<\?x12x197xf32>.*tensor<\?x12x197xf32>.*->.*tensor<\?x12x197xf32>',
+            shlo_text) is not None)
+
+  def test_div_scalar(self):
+    args = (torch.rand((10, 12, 197)), 8.0)
+    dynamic_shapes = (({0: Dim("dim")}, None),)
+    m = wrap_func_as_nn_module(torch.ops.aten.div.Tensor)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r'tensor<\?x12x197xf32>.*->.*tensor<\?x12x197xf32>',
+                  shlo_text) is not None)
+
+  def test_gelu(self):
+    args = (torch.rand((3, 5)),)
+    dynamic_shapes = (({0: Dim("dim")},),)
+    m = wrap_func_as_nn_module(torch.ops.aten.gelu)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r'tensor<\?x5xf32>.*->.*tensor<\?x5xf32>', shlo_text)
+        is not None)
+
+  def test_embedding(self):
+
+    class M(torch.nn.Module):
+
+      def forward(self, x, y):
+        res = torch.ops.aten.embedding.default(x, y)
+        return res
+
+    args = (torch.rand((20, 768)), torch.randint(0, 15,
+                                                 (3, 10)).to(torch.int64))
+    dynamic_shapes = (None, {0: Dim("dim")})
+    m = M()
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r"%arg.: tensor<\?x10xi64>.*->.*tensor<\?x10x768xf32>",
+                  shlo_text) is not None)
+
+  def test_mean(self):
+
+    class M(torch.nn.Module):
+
+      def forward(self, x):
+        return torch.mean(x, -1, keepdim=True)
+
+    args = (torch.rand((10, 197, 768)),)
+    dynamic_shapes = ({0: Dim("dim")},)
+    m = M()
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r"%arg.: tensor<\?x197x768xf32>.*->.*tensor<\?x197x1xf32>",
+                  shlo_text) is not None)
+
+  def test_mul(self):
+    args = (torch.rand((10, 2, 768)), torch.rand((10, 2, 768)))
+    dynamic_shapes = (({0: Dim("dim")}, {0: Dim("dim")}),)
+    m = wrap_func_as_nn_module(torch.ops.aten.mul.Tensor)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r'tensor<\?x2x768xf32>.*tensor<\?x2x768xf32>.*->.*tensor<\?x2x768xf32>',
+            shlo_text) is not None)
+
+  def test_mul_scalar(self):
+    args = (torch.rand((10, 2, 768)), 0.125)
+    dynamic_shapes = (({0: Dim("dim")}, None),)
+    m = wrap_func_as_nn_module(torch.ops.aten.mul.Tensor)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r'tensor<\?x2x768xf32>.*->.*tensor<\?x2x768xf32>', shlo_text)
+        is not None)
+
+  def test_ne_scalar(self):
+
+    class M(torch.nn.Module):
+
+      def forward(self, x):
+        return torch.ops.aten.ne.Scalar(x, 1).to(torch.int32)
+
+    args = (torch.rand((3, 5)).to(torch.int64),)
+    dynamic_shapes = ({0: Dim("dim")},)
+    m = M()
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r"%arg.: tensor<\?x5xi64>.*->.*tensor<\?x5xi32>", shlo_text)
+        is not None)
+
+  def test_var(self):
+
+    class M(torch.nn.Module):
+
+      def forward(self, x):
+        return torch.var(x, -1, keepdim=True, correction=0)
+
+    args = (torch.rand((10, 197, 768)),)
+    dynamic_shapes = ({0: Dim("dim")},)
+    m = M()
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r"%arg.: tensor<\?x197x768xf32>.*->.*tensor<\?x197x1xf32>",
+                  shlo_text) is not None)
+
+  def test_native_group_norm(self):
+
+    class M2(torch.nn.Module):
+
+      def __init__(self):
+        super().__init__()
+        self.layer_norm = torch.nn.GroupNorm(
+            num_groups=512, num_channels=512, affine=True)
+
+      def forward(self, x):
+        x = self.layer_norm(x)
+        return x
+
+    args = (torch.rand((10, 512, 159)),)
+    dynamic_shapes = ({0: Dim("dim")},)
+    m = M2()
+    out1 = m(*args)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r"%arg.: tensor<\?x512x159xf32>.*->.*tensor<\?x512x159xf32>",
+                  shlo_text) is not None)
+
+  def test_native_layer_norm(self):
+
+    class M(torch.nn.Module):
+
+      def forward(self, x, weight, bias):
+        return torch.ops.aten.native_layer_norm.default(x, [768], weight, bias,
+                                                        1e-12)[0]
+
+    args = (
+        torch.rand((10, 197, 768)),
+        torch.rand((768)),
+        torch.rand((768)),
+    )
+    dynamic_shapes = ({0: Dim("dim")}, None, None)
+    m = M()
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r"%arg.: tensor<\?x197x768xf32>.*->.*tensor<\?x197x768xf32>",
+                  shlo_text) is not None)
+
+  def test_permute(self):
+    args = (torch.rand((10, 197, 12, 64)),)
+    dynamic_shapes = (({0: Dim("dim")},),)
+    m = wrap_func_as_nn_module(
+        lambda x: torch.ops.aten.permute.default(x, [0, 2, 1, 3]))
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r"%arg.: tensor<\?x197x12x64xf32>.*->.*tensor<\?x12x197x64xf32>",
+            shlo_text) is not None)
+
+  def test_select(self):
+    args = (torch.rand((10, 197, 768)), 1, 0)
+    dynamic_shapes = (({0: Dim("dim")}, None, None),)
+    m = wrap_func_as_nn_module(torch.ops.aten.select.int)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r"%arg.: tensor<\?x197x768xf32>.*->.*tensor<\?x768xf32>",
+                  shlo_text) is not None)
+
+  def test_slice(self):
+    args = (torch.rand((10, 3, 224, 224)), 0, 0, 9223372036854775807)
+    dynamic_shapes = (({0: Dim("dim")}, None, None, None),)
+    m = wrap_func_as_nn_module(torch.ops.aten.slice.Tensor)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r"%arg.: tensor<\?x3x224x224xf32>.*->.*tensor<\?x3x224x224xf32>",
+            shlo_text) is not None)
+
+  def test_slice_2(self):
+    args = (torch.rand((10, 3, 224, 224)), 1, 0, 2)
+    dynamic_shapes = (({0: Dim("dim")}, None, None, None),)
+    m = wrap_func_as_nn_module(torch.ops.aten.slice.Tensor)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r"%arg.: tensor<\?x3x224x224xf32>.*->.*tensor<\?x2x224x224xf32>",
+            shlo_text) is not None)
+
+  def test_softmax(self):
+    args = (torch.rand((10, 12, 197, 197)), -1, False)
+    dynamic_shapes = (({0: Dim("dim")}, None, None),)
+    m = wrap_func_as_nn_module(torch.ops.aten._softmax.default)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r"%arg.: tensor<\?x12x197x197xf32>.*->.*tensor<\?x12x197x197xf32>",
+            shlo_text) is not None)
+
+  def test_sub(self):
+    args = (torch.rand((10, 1, 1, 10)), torch.rand((10, 1, 1, 10)))
+    dynamic_shapes = (({0: Dim("dim")}, {0: Dim("dim")}),)
+    m = wrap_func_as_nn_module(torch.ops.aten.sub.Tensor)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r'tensor<\?x1x1x10xf32>.*tensor<\?x1x1x10xf32>.*->.*tensor<\?x1x1x10xf32>',
+            shlo_text) is not None)
+
+  def test_softmax_reduce_on_dynamic_dim(self):
+    args = (torch.rand((1, 8, 128, 3)), -1, False)
+    dynamic_shapes = (({3: Dim("dim")}, None, None),)
+    m = wrap_func_as_nn_module(torch.ops.aten._softmax.default)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r"%arg.: tensor<1x8x128x\?xf32>.*->.*tensor<1x8x128x\?xf32>",
+                  shlo_text) is not None)
+
+  @unittest.skip("Converted StableHLO contains i1 dtype, not expected.")
+  def test_index(self):
+    args = (torch.rand((2, 10)), torch.arange(5))
+    dynamic_shapes = ((None, {0: Dim("dim")}),)
+    m = wrap_func_as_nn_module(
+        lambda x, y: torch.ops.aten.index.Tensor(x, [None, y]))
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r"%arg.: tensor<\?xi64>.*%arg.: tensor<2x10xf32>.*->.*tensor<2x\?xf32>",
+            shlo_text) is not None)
+
+  def test_sub_scalar(self):
+    args = (1.0, torch.rand((10, 1, 1, 10)))
+    dynamic_shapes = ((None, {0: Dim("dim")}),)
+    m = wrap_func_as_nn_module(torch.ops.aten.sub.Tensor)
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r'tensor<\?x1x1x10xf32>.*->.*tensor<\?x1x1x10xf32>',
+                  shlo_text) is not None)
+
+  def test_split_with_sizes(self):
+
+    class M(torch.nn.Module):
+
+      def forward(self, x):
+        res = torch.ops.aten.split_with_sizes.default(x, [1, 2, 3], -1)
+        return res[0], res[1], res[2]
+
+    args = (torch.rand((3, 10, 6)),)
+    dynamic_shapes = ({0: Dim("dim")},)
+    m = M()
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r"%arg.: tensor<\?x10x6xf32>.*->.*tensor<\?x10x1xf32>.*tensor<\?x10x2xf32>.*tensor<\?x10x3xf32>",
+            shlo_text) is not None)
+
+  def test_transpose_on_dynamic_dim(self):
+    args = (torch.rand((1, 8, 3, 256)),)
+    dynamic_shapes = (({2: Dim("dim")},),)
+    m = wrap_func_as_nn_module(
+        lambda x: torch.ops.aten.transpose.int(x, -2, -1))
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r"%arg.: tensor<1x8x\?x256xf32>.*->.*tensor<1x8x256x\?xf32>",
+                  shlo_text) is not None)
+
+  def test_unsqueeze_1(self):
+    args = (torch.rand((3, 10)),)
+    dynamic_shapes = (({0: Dim("dim")},),)
+    m = wrap_func_as_nn_module(lambda x: torch.ops.aten.unsqueeze.default(x, 1))
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r"%arg.: tensor<\?x10xf32>.*->.*tensor<\?x1x10xf32>",
+                  shlo_text) is not None)
+
+  def test_unsqueeze_2(self):
+    args = (torch.rand((1, 1, 3, 256)),)
+    dynamic_shapes = (({2: Dim("dim")},),)
+    m = wrap_func_as_nn_module(lambda x: torch.ops.aten.unsqueeze.default(x, 2))
+    ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r"%arg.: tensor<1x1x\?x256xf32>.*->.*tensor<1x1x1x\?x256xf32>",
+            shlo_text) is not None)
+
+  def test_dynamic_view(self):
+
+    class M(torch.nn.Module):
+
+      def __init__(self):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(3, 5, [16, 16])
+
+      def forward(self, x):
+        x = self.conv(x)
+        return x.view(x.shape[0], x.shape[1], -1)
+
+    m = M().eval()
+    args = (torch.rand((10, 3, 224, 224)),)
+    dynamic_shapes = ({0: Dim("bs")},)
+    ep = export(m, args, dynamic_shapes=dynamic_shapes)
+    out1 = ep.module()(*args)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r"%arg.: tensor<\?x3x224x224xf32>.*->.*tensor<\?x5x43681xf32>",
+            shlo_text) is not None)
+
+  @unittest.skip("Cannot generate aten.sym_numel in the exported program.")
+  def test_dynamic_view_sym_numel(self):
+
+    class M(torch.nn.Module):
+
+      def forward(self, x, range):
+        num_elem = torch.numel(range)
+        return x.view(x.shape[0], x.shape[2], num_elem, x.shape[4])
+
+    m = M().eval()
+    args = (torch.rand((1, 1, 8, 3, 256)), torch.arange(3))
+    dynamic_shapes = ({3: Dim("bs")}, {0: Dim("bs")})
+    ep = export(m, args, dynamic_shapes=dynamic_shapes)
+    out1 = ep.module()(*args)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r"%arg.: tensor<\?x3x224x224xf32>.*->.*tensor<\?x5x43681xf32>",
+            shlo_text) is not None)
+
+  def test_dynamic_view_non_bs(self):
+
+    class M(torch.nn.Module):
+
+      def forward(self, x):
+        return x.view(x.shape[0], x.shape[1] * x.shape[2], x.shape[3])
+
+    m = M().eval()
+    args = (torch.rand((1, 3, 2, 16)),)
+    dynamic_shapes = ({1: Dim("bs")},)
+    ep = export(m, args, dynamic_shapes=dynamic_shapes)
+    out1 = ep.module()(*args)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r"%arg.: tensor<1x\?x2x16xf32>.*->.*tensor<1x\?x16xf32>",
+                  shlo_text) is not None)
+
+  def test_dynamic_view_multiplier(self):
+
+    class M(torch.nn.Module):
+
+      def forward(self, x):
+        return x.view(x.shape[0] * x.shape[1], -1)
+
+    m = M().eval()
+    args = (torch.rand((10, 197, 768)),)
+    dynamic_shapes = ({0: Dim("bs")},)
+    ep = export(m, args, dynamic_shapes=dynamic_shapes)
+    out1 = ep.module()(*args)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r"%arg.: tensor<\?x197x768xf32>.*->.*tensor<\?x768xf32>",
+                  shlo_text) is not None)
+
+  def test_dynamic_expand(self):
+
+    class M(torch.nn.Module):
+
+      def forward(self, x, image):
+        return x.expand(image.shape[0], -1, -1)
+
+    m = M().eval()
+    args = (torch.rand((1, 1, 768)), torch.rand((10, 3, 224, 224)))
+    dynamic_shapes = (
+        None,
+        {
+            0: Dim("bs")
+        },
+    )
+    ep = export(m, args, dynamic_shapes=dynamic_shapes)
+    out1 = ep.module()(*args)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r"%arg.: tensor<1x1x768xf32>.*->.*tensor<\?x1x768xf32>",
+                  shlo_text) is not None)
+
+  def test_dynamic_expand_2(self):
+
+    class M(torch.nn.Module):
+
+      def forward(self, x, range):
+        return x.expand(1, 1, 8, range.shape[0], 256)
+
+    m = M().eval()
+    args = (torch.rand((1, 1, 1, 3, 256)), torch.arange(3))
+    dynamic_shapes = ({3: Dim("bs")}, {0: Dim("bs")})
+    ep = export(m, args, dynamic_shapes=dynamic_shapes)
+    out1 = ep.module()(*args)
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r"%arg.: tensor<1x1x1x\?x256xf32>.*->.*tensor<1x1x8x\?x256xf32>",
+            shlo_text) is not None)
+
+
+if __name__ == "__main__":
+  test = unittest.main()
+  sys.exit(0 if test.result.wasSuccessful() else 1)
diff --git a/experimental/torch_xla2/test_requirements.txt b/experimental/torch_xla2/test_requirements.txt
deleted file mode 100644
index c8596327236..00000000000
--- a/experimental/torch_xla2/test_requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-pytest
-immutabledict
-sentencepiece
-pytest-xdist
-expecttest
\ No newline at end of file
diff --git a/experimental/torch_xla2/torch_xla2/__init__.py b/experimental/torch_xla2/torch_xla2/__init__.py
index b0bb20712d4..bd0e00fa6ca 100644
--- a/experimental/torch_xla2/torch_xla2/__init__.py
+++ b/experimental/torch_xla2/torch_xla2/__init__.py
@@ -1,31 +1,34 @@
-import contextlib
 import jax
 import torch
 from torch._functorch import make_functional
 from torch.utils import _pytree as pytree
-from torch_xla2 import export, _ops, ops_registry, tensor, tf_integration, functions
+from torch_xla2 import export, tensor, tf_integration
 
 jax.config.update('jax_enable_x64', True)
 
+env = None
+def default_env():
+  global env
+  if env is None:
+    env = tensor.Environment(0)
+  return env
 
-@contextlib.contextmanager
-def mode():
-  with tensor.XLADispatchMode(), functions.XLAFunctionMode():
-    yield
 
 
-def extract_jax(mod: torch.nn.Module):
+def extract_jax(mod: torch.nn.Module, env=None):
   """Returns a pytree of jax.ndarray and a jax callable."""
+  if env is None:
+    env = default_env()
   func, weights, buffer = make_functional.make_functional_with_buffers(mod)
-  states = (weights, buffer)
+  states = mod.state_dict()
+
   states = pytree.tree_map_only(torch.Tensor, tensor.t2j, states)
 
   #@jax.jit
   def jax_func(states, inputs):
-    (states, inputs) = tensor.wrap((states, inputs))
-    weights, buffer = states
-    with tensor.XLADispatchMode():
-      res = func(weights, buffer, *inputs)
-    return tensor.unwrap(res)
+    (states, inputs) = env.j2t_iso((states, inputs))
+    with env:
+      res = torch.func.functional_call(mod, states, inputs)
+    return env.t2j_iso(res)
 
   return states, jax_func
diff --git a/experimental/torch_xla2/torch_xla2/_ops.py b/experimental/torch_xla2/torch_xla2/_ops.py
deleted file mode 100644
index fe0f97a0f01..00000000000
--- a/experimental/torch_xla2/torch_xla2/_ops.py
+++ /dev/null
@@ -1,1745 +0,0 @@
-# pylint: disable
-"""Torch ops implemented using jax."""
-import sys
-
-import jax
-from jax import numpy as jnp
-import numpy as np
-import torch
-from torch_xla2 import ops_registry
-from torch_xla2 import tensor
-
-
-class TorchFunctionLowering:
-
-  def __init__(self, func, is_jax_func, should_jit=False):
-    if is_jax_func and should_jit:
-      func = jax.jit(func)
-    self.func = func
-    self.is_jax_func = is_jax_func
-
-  def __call__(self, *args, **kwargs):
-    if self.is_jax_func:
-      (args, kwargs) = tensor.unwrap((args, kwargs))
-    res = self.func(*args, **kwargs)
-    if self.is_jax_func:
-      res = tensor.wrap(res)
-    return res
-
-
-def op(aten_op, is_jax_func=True):
-  """if is_jax_func is true, then the function it will register
-
-  should takes jax array as input and returns jax array.
-
-  Which means we need to wrap it
-  """
-
-  def inner(func):
-    ops_registry.lowerings.register(aten_op,
-                                    TorchFunctionLowering(func, is_jax_func))
-    return func
-
-  return inner
-
-
-@op(torch.ops.aten.view_copy)
-@op(torch.ops.aten.view)
-@op(torch.ops.aten._unsafe_view)
-@op(torch.ops.aten.reshape)
-def _aten_unsafe_view(x, shape):
-  return jnp.reshape(x, shape)
-
-
-@op(torch.ops.aten.add)
-def _aten_add(x, y, *, alpha=1):
-  """if isinstance(x, jnp.ndarray) and isinstance(y, jnp.ndarray):
-
-  assert x.dtype == y.dtype, (x.dtype, y.dtype)
-  """
-  return x + y * alpha
-
-
-@op(torch.ops.aten.copy_, is_jax_func=False)
-def _aten_copy(x, y, memory_format=None):
-  if isinstance(x, tensor.XLATensor2):
-    x._elem = y._elem
-  elif isinstance(x, tensor.SliceView):
-    x.mutate(y)
-  return x
-
-
-@op(torch.ops.aten.clone)
-def _aten_clone(x, memory_format=None):
-  return jnp.copy(x)
-
-
-@op(torch.ops.aten.full)
-def _aten_full(size, value, **kwargs):
-  return jnp.full(size, value)
-
-
-@op(torch.ops.aten.index_copy)
-def _aten_index_copy(x, dim, indexes, source):
-  # return jax.lax.scatter(x, index, dim)
-  dims = []
-  for i in range(len(x.shape)):
-    if i == dim:
-      dims.append(indexes)
-    else:
-      dims.append(slice(None, None, None))
-  return x.at[dim].set(source)
-
-
-@op(torch.ops.aten.select)
-@op(torch.ops.aten.index_select)
-@op(torch.ops.aten.select_copy)
-def _aten_index_select(x, dim, indexes):
-  dims = []
-  for i in range(len(x.shape)):
-    if i == dim:
-      dims.append(indexes)
-    else:
-      dims.append(slice(None, None, None))
-  return x[tuple(dims)]
-
-
-@op(torch.ops.aten.mean)
-def _aten_mean(x, dim=None, keepdim=False):
-  return jnp.mean(x, dim, keepdims=keepdim)
-
-
-def _torch_binary_scalar_type(scalar, tensor):
-  if "float" in str(tensor.dtype):
-    return tensor.dtype
-
-  if isinstance(scalar, int):
-    if "int" in str(tensor.dtype):
-      return tensor.dtype
-
-  return jnp.float32
-
-
-@op(torch.ops.aten.sub)
-def _aten_sub(x, y):
-  if isinstance(x, float):
-    dtype = _torch_binary_scalar_type(x, y)
-    x = jnp.array(x, dtype=dtype)
-  if isinstance(y, float):
-    dtype = _torch_binary_scalar_type(y, x)
-    y = jnp.array(y, dtype=dtype)
-  return x - y
-
-
-@op(torch.ops.aten.mm)
-def _aten_mm(x, y):
-  res = x @ y
-  return res
-
-
-@op(torch.ops.aten.mul)
-def _aten_mul(x, y):
-  return x * y
-
-
-@op(torch.ops.aten.silu)
-def _aten_silu(x):
-  return jax.nn.silu(x)
-
-
-@op(torch.ops.aten.t)
-def _aten_t(x):
-  return jnp.transpose(x)
-
-
-@op(torch.ops.aten.transpose)
-@op(torch.ops.aten.transpose_copy)
-def _aten_transpose(x, dim0, dim1):
-  shape = list(range(len(x.shape)))
-  shape[dim0], shape[dim1] = shape[dim1], shape[dim0]
-  return jnp.transpose(x, shape)
-
-
-@op(torch.ops.aten.triu)
-def _aten_triu(m, k):
-  return jnp.triu(m, k)
-
-
-@op(torch.ops.aten.slice)
-@op(torch.ops.aten.slice_copy)
-def _aten_slice(self, dim=0, start=None, end=None, step=1):
-  if end == sys.maxsize:
-    end = self.shape[dim]
-  sl = slice(start, end, step)
-  dims = []
-  for i in range(len(self.shape)):
-    if i == dim:
-      dims.append(sl)
-    else:
-      dims.append(slice(None, None, None))
-  return self[tuple(dims)]
-
-
-@op(torch.ops.aten.detach)
-def _aten_detach(self):
-  return self
-
-
-@op(torch.ops.aten.view_as_real)
-def _aten_view_as_real(x):
-  real = jnp.real(x)
-  im = jnp.imag(x)
-  res = jnp.stack([real, im], -1)
-  return res
-
-
-@op(torch.ops.aten.stack)
-def _aten_stack(tensors, dim=0):
-  return jnp.stack(tensors, dim)
-
-
-@op(torch.ops.aten._softmax)
-def _aten_softmax(x, dim, halftofloat):
-  return jax.nn.softmax(x, dim)
-
-
-@op(torch.ops.aten.pow)
-def _aten_pow(x, y):
-  if isinstance(y, int):
-    y = float(y)
-  return jnp.power(x, y)
-
-
-@op(torch.ops.aten.view_as_complex)
-def _aten_view_as_complex(input):
-  if input.dtype == jnp.bfloat16:
-    input = input.astype(jnp.float32)
-  x, y = input[..., 0], input[..., 1]
-  return jax.lax.complex(x, y)
-
-
-@op(torch.ops.aten.div)
-def _aten_div(x, y, rounding_mode=""):
-  res = x / y
-  if rounding_mode == "trunc":
-    res = jnp.trunc(res)
-  return res
-
-
-@op(torch.ops.aten.div_, is_jax_func=False)
-def _aten_div_(x, y, rounding_mode=""):
-  x._elem = _aten_div(x._elem, y._elem, rounding_mode)
-  return x
-
-
-@op(torch.ops.aten.true_divide)
-def _aten_true_divide(x, y):
-  return x / y
-
-
-@op(torch.ops.aten.bmm)
-def _aten_bmm(x, y):
-  res = x @ y
-  return res
-  # return jnp.einsum('bnm,bmk->bnk', x, y)
-
-
-@op(torch.ops.aten.embedding)
-# embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False)
-def _aten_embedding(a, w, padding_idx=-1):
-  return jnp.take(a, w, axis=0)
-
-
-@op(torch.ops.aten.rsqrt)
-def _aten_rsqrt(x):
-  if isinstance(x, int):
-    x = float(x)
-  if x.dtype == jnp.int32:
-    x = x.astype(jnp.float32)
-  return jax.lax.rsqrt(x)
-
-
-@op(torch.ops.aten.expand)
-@op(torch.ops.aten.expand_copy)
-def _aten_expand(x, dims):
-
-  def fix_dims(d, xs):
-    if d == -1:
-      return xs
-    return d
-
-  dims = [fix_dims(p, s) for p, s in zip(dims, x.shape)]
-  return jnp.broadcast_to(x, dims)
-
-
-@op(torch.ops.aten.dot)
-def _aten_dot(x, y):
-  return jnp.dot(x, y)
-
-
-@op(torch.ops.aten._to_copy)
-def _aten__to_copy(self, **kwargs):
-  dtype = tensor.t2j_dtype(kwargs["dtype"])
-  if dtype != self.dtype:
-    return self.astype(dtype)
-  return jnp.copy(self)
-
-
-@op(torch.ops.aten.empty)
-def _aten_empty(sizes, **kwargs):
-  return jnp.zeros(sizes)
-
-
-@op(torch.ops.aten.index_put_)
-@op(torch.ops.aten.index_put)
-def _aten_index_put(self, indexes, values, accumulate=False):
-  indexes = [slice(None, None, None) if i is None else i for i in indexes]
-  indexes = tuple(indexes)
-  if accumulate:
-    return self.at[indexes].add(values)
-  else:
-    return self.at[indexes].set(values)
-
-
-@op(torch.ops.aten.index)
-@op(torch.ops.aten._unsafe_index)
-@op(torch.ops.aten.index.Tensor)
-def _aten_index(self, indexes):
-  indexes = [slice(None, None, None) if i is None else i for i in indexes]
-  indexes = tuple(indexes)
-  return self[indexes]
-
-
-@op(torch.ops.aten.split)
-@op(torch.ops.aten.split_copy)
-@op(torch.ops.aten.split_with_sizes)
-def split_with_sizes(x, sizes, dim=0):
-  """Splits an array `x` into sub-arrays based on static sizes `sizes`.
-
-  Args:
-    x: The input array to split.
-    sizes: A 1D array of integer sizes for each sub-array.
-
-  Returns:
-    A list of sub-arrays.
-  """
-  if isinstance(sizes, int):
-    # split equal size
-    new_sizes = [sizes] * (x.shape[dim] // sizes)
-    sizes = new_sizes
-  rank = x.ndim
-  splits = np.cumsum(sizes)  # Cumulative sum for split points
-
-  def make_range(rank, dim, start, end):
-    res = [slice(None, None, None)] * rank
-    res[dim] = slice(start, end)
-    return tuple(res)
-
-  return [
-      x[make_range(rank, dim, start, end)]
-      for start, end in zip([0] + list(splits[:-1]), splits)
-  ]
-
-
-@op(torch.ops.aten.permute)
-@op(torch.ops.aten.permute_copy)
-def permute(t, dims):
-  return jnp.transpose(t, dims)
-
-
-@op(torch.ops.aten.unsqueeze)
-@op(torch.ops.aten.unsqueeze_copy)
-@op(torch.ops.aten.unsqueeze.default)
-def _aten_unsqueeze(self, dim):
-  if dim < 0:
-    dim += self.ndim + 1
-  return jnp.expand_dims(self, dim)
-
-
-@op(torch.ops.aten.ne)
-def _aten_ne(x, y):
-  return jnp.not_equal(x, y)
-
-
-@op(torch.ops.aten.cumsum)
-def _aten_cumsum(x, y, dtype=None):
-  dtype = tensor.t2j_dtype(dtype)
-  res = jnp.cumsum(x, y, dtype)
-  return res
-
-
-@op(torch.ops.aten.native_layer_norm)
-def _aten_native_layer_norm(input,
-                            normalized_shape,
-                            weight=None,
-                            bias=None,
-                            eps=1e-5):
-  """Implements layer normalization in Jax as defined by `aten::native_layer_norm`.
-
-  Args:
-    input: The input tensor.
-    normalized_shape: A list of integer dimensions to be normalized over.
-    weight: Optional weight tensor for the affine transformation.
-    bias: Optional bias tensor for the affine transformation.
-    eps: A small epsilon value for numerical stability.
-
-  Returns:
-    output: The normalized tensor.
-    mean: The calculated mean tensor.
-    std: The calculated standard deviation tensor.
-  """
-  if isinstance(normalized_shape, int):
-    normalized_shape = [normalized_shape]
-  axis = [i for i, d in enumerate(input.shape) if d in normalized_shape]
-
-  # Calculate mean and standard deviation
-  mean = jnp.mean(input, axis=axis, keepdims=True)
-  var = jnp.var(input, axis=axis, keepdims=True)
-  rstd = jax.lax.rsqrt(var + eps)
-
-  # Normalize the input
-  norm_x = (input - mean) * rstd
-
-  # Apply affine transformation (if provided)
-  if weight is not None:
-    norm_x *= weight
-  if bias is not None:
-    norm_x += bias
-  return norm_x, mean, rstd
-
-
-# - func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
-@op(torch.ops.aten.addmm)
-def _aten_addmm(self, mat1, mat2, *, beta=1.0, alpha=1.0):
-  self *= beta
-  self += alpha * jnp.matmul(mat1, mat2)
-  return self
-
-
-@op(torch.ops.aten.gelu)
-def _aten_gelu(self, *, approximate="none"):
-  approx = approximate == "tanh"
-  return jax.nn.gelu(self, approx)
-
-
-@op(torch.ops.aten.squeeze)
-@op(torch.ops.aten.squeeze_copy)
-def _aten_squeeze_dim(self, dim):
-  """Squeezes a Jax tensor by removing a single dimension of size 1.
-
-  Args:
-    self: The input tensor.
-    dim: The dimension to squeeze.
-
-  Returns:
-    The squeezed tensor with the specified dimension removed if it is 1,
-    otherwise the original tensor is returned.
-  """
-
-  # Validate input arguments
-  if not isinstance(self, jnp.ndarray):
-    raise TypeError(f"Expected a Jax tensor, got {type(self)}.")
-  if isinstance(dim, int):
-    dim = [dim]
-
-  # Check if the specified dimension has size 1
-  if all([self.shape[d] != 1 for d in dim]):
-    return self
-
-  # Use slicing to remove the dimension if it is 1
-  new_shape = list(self.shape)
-
-  def fix_dim(p):
-    if p < 0:
-      return p + len(self.shape)
-    return p
-
-  dim = [fix_dim(d) for d in dim]
-  new_shape = [p for i, p in enumerate(self.shape) if i not in dim or p != 1]
-  return self.reshape(new_shape)
-
-
-@op(torch.ops.aten.convolution)
-def _aten_convolution(
-    input,
-    weight,
-    bias,
-    stride,
-    padding,
-    dilation,
-    transposed,
-    output_padding,
-    groups,
-):
-  if transposed:
-    raise NotImplementedError("Transposed convolution is not implemented.")
-
-  def make_padding(padding):
-    return ((p, p) for p in padding)
-
-  def create_default_conv_dimension_numbers(num_spatial_dims):
-    # Ref: https://github.com/openxla/xla/blob/main/xla/client/xla_builder.cc#L4211
-    # (batch dimension, feature dimension, spatial dimensions...)
-    lhs_spec = [0, 1]
-    # (out feature dimension, in feature dimension, spatial dimensions...)
-    rhs_spec = [0, 1]
-    # (batch dimension, feature dimension, spatial dimensions...)
-    out_spec = [0, 1]
-    for i in range(0, num_spatial_dims):
-      lhs_spec.append(i + 2)
-      rhs_spec.append(i + 2)
-      out_spec.append(i + 2)
-    return jax.lax.ConvDimensionNumbers(
-        *map(tuple, (lhs_spec, rhs_spec, out_spec)))
-
-  res = jax.lax.conv_general_dilated(
-      input,
-      weight,
-      stride,
-      make_padding(padding),
-      lhs_dilation=(1,) * len(stride),
-      rhs_dilation=dilation,
-      dimension_numbers=create_default_conv_dimension_numbers(len(stride)),
-      feature_group_count=groups,
-      batch_group_count=1,
-  )
-
-  if bias is not None:
-    # TODO(qihqi): bias always on channel?
-    if len(bias.shape) == 1:
-      shape = [1] * len(res.shape)
-      shape[1] = bias.shape[0]
-      bias = bias.reshape(tuple(shape))
-    res = res + bias
-  return res
-
-
-# _native_batch_norm_legit(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps)
-@op(torch.ops.aten._native_batch_norm_legit)
-def _aten__native_batch_norm_legit(input, weight, bias, running_mean,
-                                   running_var, training, momentum, eps):
-  return _aten__native_batch_norm_legit_no_training(input, weight, bias,
-                                                    running_mean, running_var,
-                                                    momentum, eps)
-
-
-@op(torch.ops.aten._native_batch_norm_legit_no_training)
-def _aten__native_batch_norm_legit_no_training(input, weight, bias,
-                                               running_mean, running_var,
-                                               momentum, eps):
-  if weight is None:
-    weight = jnp.ones_like(running_mean)
-  if bias is None:
-    bias = jnp.zeros_like(running_mean)
-
-  def broadcast(t):
-    return jax.lax.broadcast_in_dim(t, input.shape, broadcast_dimensions=(1,))
-
-  a = input - broadcast(running_mean)
-  b = broadcast(jnp.sqrt(running_var + eps))
-  return (
-      a / b * broadcast(weight) + broadcast(bias),
-      jnp.array([]),
-      jnp.array([]),
-  )
-
-
-@op(torch.ops.aten.relu)
-def _aten_relu(self):
-  return jax.nn.relu(self)
-
-
-@op(torch.ops.aten.cat)
-def _aten_cat(tensors, dims=0):
-  return jnp.concatenate(tensors, dims)
-
-
-@op(torch.ops.aten.max_pool2d_with_indices)
-@op(torch.ops.aten.max_pool3d_with_indices)
-def _aten_max_pool2d_with_indices(inputs,
-                                  kernel_size,
-                                  strides,
-                                  padding=0,
-                                  dilation=1,
-                                  ceil_mode=False):
-  num_batch_dims = len(inputs.shape) - len(kernel_size) - 1
-  kernel_size = tuple(kernel_size)
-  strides = tuple(strides)
-  if isinstance(padding, int):
-    padding = tuple((padding, padding) for _ in range(len(kernel_size)))
-  elif isinstance(padding, list):
-    padding = tuple((p, p) for p in padding)
-
-  window_shape = kernel_size
-  num_batch_dims = inputs.ndim - (len(window_shape) + 1)
-  strides = strides or (1,) * len(window_shape)
-  assert len(window_shape) == len(
-      strides), f'len({window_shape}) must equal len({strides})'
-  strides = (1,) * (1 + num_batch_dims) + strides
-  dims = (1,) * (1 + num_batch_dims) + window_shape
-
-  is_single_input = False
-  if num_batch_dims == 0:
-    # add singleton batch dimension because lax.reduce_window always
-    # needs a batch dimension.
-    inputs = inputs[None]
-    strides = (1,) + strides
-    dims = (1,) + dims
-    is_single_input = True
-
-  assert inputs.ndim == len(dims), f'len({inputs.shape}) != len({dims})'
-  if not isinstance(padding, str):
-    padding = tuple(map(tuple, padding))
-    assert len(padding) == len(window_shape), (
-        f'padding {padding} must specify pads for same number of dims as '
-        f'window_shape {window_shape}')
-    assert all([len(x) == 2 for x in padding
-               ]), f'each entry in padding {padding} must be length 2'
-    padding = ((0, 0), (0, 0)) + padding
-
-  indices = jnp.arange(np.prod(inputs.shape)).reshape(inputs.shape)
-
-  def reduce_fn(a, b):
-    ai, av = a
-    bi, bv = b
-    which = av > bv
-    return jnp.where(which, ai, bi), jnp.where(which, av, bv)
-
-  init_val = -jnp.inf
-  if inputs.dtype in (jnp.int32, jnp.int64):
-    init_val = -(1 << 31)
-  init_val = jnp.array(init_val).astype(inputs.dtype)
-
-  indices, y = jax.lax.reduce_window((indices, inputs), (0, init_val),
-                                     reduce_fn, dims, strides, padding)
-  if is_single_input:
-    indices = jnp.squeeze(indices, axis=0)
-    y = jnp.squeeze(y, axis=0)
-  return y, indices
-
-  batch_result = pool(inputs, -jnp.inf, jax.lax.max, kernel_size, strides,
-                      padding)
-  indices = pool(inputs, 0, jnp.argmax, kernel_size, strides, padding)
-  return batch_result, indices
-
-
-# TODO add more ops
-
-
-@op(torch.ops.aten.min)
-def _aten_min(x, axis=None):
-  return jnp.min(x, axis=axis), jnp.argmin(x, axis=axis).astype(jnp.int64)
-
-
-@op(torch.ops.aten.amin)
-def _aten_amin(x, axis=None):
-  return jnp.min(x, axis=axis)
-
-
-@op(torch.ops.aten.argmin)
-def _aten_amin(x, axis=None):
-  return jnp.argmin(x, axis=axis)
-
-
-@op(torch.ops.aten.sin)
-def _aten_sin(x):
-  return jnp.sin(x)
-
-
-@op(torch.ops.aten.sym_size)
-def _aten_sym_size(x, dim):
-  return x.shape[dim]
-
-
-@op(torch.ops.aten.var)
-@op(torch.ops.prims.var)
-def _aten_var(x, dim=None, *, correction=1, keepdim=False, out=None):
-  return jnp.var(x, axis=dim, ddof=correction, keepdims=keepdim)
-
-
-@op(torch.ops.prims.broadcast_in_dim)
-def _prims_broadcast_in_dim(t, shape, broadcast_dimensions):
-  return jax.lax.broadcast_in_dim(
-      t, shape, broadcast_dimensions=broadcast_dimensions)
-
-
-# aten.native_group_norm -- should use decomp table
-# func: native_group_norm(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps) -> (Tensor, Tensor, Tensor)
-
-
-@op(torch.ops.aten.native_group_norm)
-def _aten_native_group_norm(input, weight, bias, N, C, HxW, group, eps=1e-5):
-  """Group Normalization implementation in JAX.
-
-  Args:
-    input: Input tensor. Expected shape (batch_size, channels, ... spatial dims
-      ...)
-    weight: Optional scaling (gamma) parameter. Shape (channels,)
-    bias: Optional shifting (beta) parameter. Shape (channels,)
-    N: Batch size.
-    C: Number of channels.
-    HxW: Product of spatial dimensions (number of elements per channel after
-      flattening).
-    group: Number of groups for Group Normalization.
-    eps: Small value added for numerical stability.
-
-  Returns:
-    A tuple of (normalized_output, mean, rstd)
-  """
-
-  input_shape = input.shape
-
-  # Reshape for group-wise normalization
-  reshaped_input = jnp.reshape(input, (1, N * group, -1))
-
-  # **Core Group Normalization**
-  def group_norm_body(x):  # Function to apply within each group
-    mean = jnp.mean(x, axis=-1, keepdims=True)
-    var = jnp.var(x, axis=-1, keepdims=True)
-    rstd = jax.lax.rsqrt(var + eps)  # Reciprocal of std with epsilon
-    normalized = (x - mean) * rstd
-    return normalized, mean, rstd
-
-  normalized, group_mean, group_rstd = jax.lax.map(group_norm_body,
-                                                   reshaped_input)
-
-  # Reshape back to original input shape
-  output = jnp.reshape(normalized, input_shape)
-
-  # **Affine transformation**
-  affine_shape = [-1 if i == 1 else 1 for i in range(input.ndim)
-                 ]  # Shape for broadcasting
-  if weight is not None and bias is not None:
-    output = bias.reshape(affine_shape) + output * weight.reshape(affine_shape)
-  elif weight is not None:
-    output = output * weight.reshape(affine_shape)
-  elif bias is not None:
-    output = output + bias.reshape(affine_shape)
-
-  # Reshape mean and rstd
-  mean = jnp.reshape(group_mean, (N, group))
-  rstd = jnp.reshape(group_rstd, (N, group))
-
-  return output, mean, rstd
-
-
-@op(torch.ops.aten.linalg_vector_norm)
-def _aten_linalg_vector_norm(self, ord=2, dim=None, keepdim=False, dtype=None):
-  """Calculates the vector norm along specified dimensions.
-
-  Args:
-      self: The input tensor.
-      ord: The order of the norm. Can be a float or 'inf', '-inf', 'fro'.
-        Default is 2 (Euclidean norm).
-      dim: Dimensions along which to calculate the norm. If None, the norm is
-        calculated over all dimensions.
-      keepdim: Whether to keep the reduced dimensions.
-      dtype: Optional data type for the output.
-
-  Returns:
-      The tensor containing the calculated vector norms.
-  """
-
-  if ord not in {2, float("inf"), float("-inf"), "fro"}:
-    raise ValueError(
-        f"Unsupported ord value: {ord}. Supported values are 2, inf, -inf, and"
-        " 'fro'.")
-
-  # Special cases (for efficiency and clarity)
-  if ord == 2:  # Euclidean norm
-    result = jnp.sqrt(jnp.sum(jnp.abs(self)**2, axis=dim, keepdims=keepdim))
-
-  elif ord == float("inf"):
-    result = jnp.max(jnp.abs(self), axis=dim, keepdims=keepdim)
-
-  elif ord == float("-inf"):
-    result = jnp.min(jnp.abs(self), axis=dim, keepdims=keepdim)
-
-  elif ord == "fro":  # Frobenius norm
-    result = jnp.sqrt(jnp.sum(jnp.abs(self)**2, axis=dim, keepdims=keepdim))
-
-  else:  # General case (e.g., ord = 1, ord = 3)
-    result = jnp.sum(
-        jnp.abs(self)**ord, axis=dim, keepdims=keepdim)**(1.0 / ord)
-
-  # (Optional) dtype conversion
-  if dtype is not None:
-    result = result.astype(dtype)
-
-  return result
-
-
-# aten.reflection_pad1d
-@op(torch.ops.aten.reflection_pad1d)
-def _aten_reflection_pad1d(input, padding):
-  rank = len(input.shape)
-  pad_size = [(0, 0)] * rank
-  pad_size[-1] = padding
-  return jnp.pad(input, pad_size, mode="reflect")
-
-
-# aten.alias
-@op(torch.ops.aten.alias)
-def _aten_alias(self, *args):
-  return self
-
-
-# aten.sinh
-@op(torch.ops.aten.sinh)
-def _aten_sinh(self):
-  return jnp.sinh(self)
-
-
-# aten.native_layer_norm_backward
-@op(torch.ops.aten.native_layer_norm_backward)
-def _aten_native_layer_norm_backward(grad_out,
-                                     input,
-                                     normalized_shape,
-                                     weight,
-                                     bias,
-                                     eps=1e-5):
-  """Implements the backward pass of layer normalization in Jax as defined by `aten::native_layer_norm_backward`.
-
-  Args:
-    grad_out: The gradient of the output tensor.
-    input: The input tensor.
-    normalized_shape: A list of integer dimensions to be normalized over.
-    weight: Optional weight tensor for the affine transformation.
-    bias: Optional bias tensor for the affine transformation.
-    eps: A small epsilon value for numerical stability.
-
-  Returns:
-    A tuple of (grad_input, grad_weight, grad_bias).
-  """
-  return jax.lax.native_layer_norm_backward(grad_out, input, normalized_shape,
-                                            weight, bias, eps)
-
-
-# aten.reflection_pad3d_backward
-# aten.reflection_pad2d
-
-
-# aten.atanh
-@op(torch.ops.aten.atanh)
-def _aten_atanh(self):
-  return jnp.arctanh(self)
-
-
-# aten.bitwise_not
-@op(torch.ops.aten.bitwise_not)
-def _aten_bitwise_not(self):
-  return ~self
-
-
-# aten.embedding_dense_backward
-
-
-# aten.sum
-@op(torch.ops.aten.sum)
-def _aten_sum(self, dim=None, keepdim=False, dtype=None):
-  return jnp.sum(self, axis=dim, keepdims=keepdim, dtype=dtype)
-
-
-# aten.sqrt
-@op(torch.ops.aten.sqrt)
-def _aten_sqrt(self):
-  return jnp.sqrt(self)
-
-
-@op(torch.ops.aten.tan)
-def _aten_tanh(self):
-  return jnp.tan(self)
-
-
-# aten.tanh
-@op(torch.ops.aten.tanh)
-def _aten_tanh(self):
-  return jnp.tanh(self)
-
-
-# aten.ceil
-@op(torch.ops.aten.ceil)
-def _aten_ceil(self):
-  return jnp.ceil(self)
-
-
-# aten.asin
-@op(torch.ops.aten.asin)
-def _aten_asin(self):
-  return jnp.arcsin(self)
-
-
-# aten.minimum
-@op(torch.ops.aten.minimum)
-def _aten_minimum(self, other):
-  return jnp.minimum(self, other)
-
-
-# aten.max_pool2d_backward
-
-
-def _scatter_index(dim, index):
-  """Returns a tuple of indexes; 
-
-  The first is to select in input (to modify),
-  the second is to select from the values.
-  """
-  index_shape = list(index.shape)
-  input_indexes = []
-  source_indexes = []
-  for i in range(len(index_shape)):
-    source_indexes.append(slice(0, index_shape[i]))
-    if i == dim:
-      input_indexes.append(index)
-    else:
-      target_shape = [1] * len(index_shape)
-      target_shape[i] = index_shape[i]
-      input_indexes.append(
-          jnp.broadcast_to(
-              jnp.arange(index_shape[i]).reshape(target_shape), index_shape))
-  return tuple(input_indexes), tuple(source_indexes)
-
-
-# aten.scatter_add
-@op(torch.ops.aten.scatter_add)
-def _aten_scatter_add(input, dim, index, src):
-  """JAX implementation of scatter, mimicking torch.scatter behavior"""
-
-  input_indexes, source_indexes = _scatter_index(dim, index)
-  return input.at[input_indexes].add(src[source_indexes])
-
-
-# aten.logical_not
-
-
-# aten.sign
-@op(torch.ops.aten.sign)
-def _aten_sign(x):
-  return jnp.sign(x)
-
-
-# aten.sigmoid
-@op(torch.ops.aten.sigmoid)
-def _aten_sigmoid(x):
-  if x.dtype in (jnp.int32, jnp.int64):
-    x = x.astype(jnp.float32)
-  return jax.nn.sigmoid(x)
-
-
-# implement aten.asinh in jax
-@op(torch.ops.aten.asinh)
-def _aten_asinh(self):
-  return jnp.arcsinh(self)
-
-
-# aten.atan
-@op(torch.ops.aten.atan)
-def _aten_atan(self):
-  return jnp.arctan(self)
-
-
-# aten.scatter_reduce
-@op(torch.ops.aten.scatter_reduce)
-def _aten_scatter_reduce(input, dim, index, src, reduce, *, include_self=True):
-  input_indexes, source_indexes = _scatter_index(dim, index)
-  if reduce == "sum":
-    return input.at[input_indexes].add(src[source_indexes])
-  elif reduce == "prod":
-    return input.at[input_indexes].multiply(src[source_indexes])
-  elif reduce == "mean":
-    return input.at[input_indexes].add(src[source_indexes])
-  elif reduce == "amax":
-    return input.at[input_indexes].max(src[source_indexes])
-  elif reduce == "amin":
-    return input.at[input_indexes].min(src[source_indexes])
-  else:
-    raise RuntimeError('Unknow reduction type: ', reduce)
-
-
-# aten.acos
-@op(torch.ops.aten.acos)
-def _aten_acos(self):
-  return jnp.arccos(self)
-
-
-# aten.sym_storage_offset
-# aten.native_layer_norm_backward
-# aten.max_pool3d_with_indices
-
-
-# aten.gt
-@op(torch.ops.aten.gt)
-def _aten_gt(self, other):
-  return self > other
-
-
-# aten.pixel_shuffle
-@op(torch.ops.aten.pixel_shuffle)
-def _aten_pixel_shuffle(x, upscale_factor):
-  """PixelShuffle implementation in JAX.
-
-  Args:
-    x: Input tensor. Typically a feature map.
-    upscale_factor: Integer by which to upscale the spatial dimensions.
-
-  Returns:
-    Tensor after PixelShuffle operation.
-  """
-
-  batch_size, channels, height, width = x.shape
-
-  if channels % (upscale_factor**2) != 0:
-    raise ValueError(
-        'Number of channels must be divisible by the square of the upscale factor.'
-    )
-
-  new_channels = channels // (upscale_factor**2)
-  new_height = height * upscale_factor
-  new_width = width * upscale_factor
-
-  x = x.reshape(batch_size, new_channels, upscale_factor, upscale_factor,
-                height, width)
-  x = jnp.transpose(x,
-                    (0, 1, 2, 4, 3, 5))  # Move channels to spatial dimensions
-  x = x.reshape(batch_size, new_channels, new_height, new_width)
-
-  return x
-
-
-# aten.sym_stride
-# aten.lt
-@op(torch.ops.aten.lt)
-def _aten_lt(self, other):
-  return self < other
-
-
-def pool(inputs, init, reduce_fn, window_shape, strides, padding):
-  """Helper function to define pooling functions.
-
-  Pooling functions are implemented using the ReduceWindow XLA op.
-  NOTE: Be aware that pooling is not generally differentiable.
-  That means providing a reduce_fn that is differentiable does not imply that
-  pool is differentiable.
-
-  Args:
-    inputs: input data with dimensions (batch, window dims..., features).
-    init: the initial value for the reduction
-    reduce_fn: a reduce function of the form ``(T, T) -> T``.
-    window_shape: a shape tuple defining the window to reduce over.
-    strides: a sequence of ``n`` integers, representing the inter-window
-      strides (default: ``(1, ..., 1)``).
-    padding: either the string ``'SAME'``, the string ``'VALID'``, or a sequence
-      of ``n`` ``(low, high)`` integer pairs that give the padding to apply before
-      and after each spatial dimension.
-  Returns:
-    The output of the reduction for each window slice.
-  """
-  num_batch_dims = inputs.ndim - (len(window_shape) + 1)
-  strides = strides or (1,) * len(window_shape)
-  assert len(window_shape) == len(
-      strides), f'len({window_shape}) must equal len({strides})'
-  strides = (1,) * (1 + num_batch_dims) + strides
-  dims = (1,) * (1 + num_batch_dims) + window_shape
-
-  is_single_input = False
-  if num_batch_dims == 0:
-    # add singleton batch dimension because lax.reduce_window always
-    # needs a batch dimension.
-    inputs = inputs[None]
-    strides = (1,) + strides
-    dims = (1,) + dims
-    is_single_input = True
-
-  assert inputs.ndim == len(dims), f'len({inputs.shape}) != len({dims})'
-  if not isinstance(padding, str):
-    padding = tuple(map(tuple, padding))
-    assert len(padding) == len(window_shape), (
-        f'padding {padding} must specify pads for same number of dims as '
-        f'window_shape {window_shape}')
-    assert all([len(x) == 2 for x in padding
-               ]), f'each entry in padding {padding} must be length 2'
-    padding = ((0, 0), (0, 0)) + padding
-  y = jax.lax.reduce_window(inputs, init, reduce_fn, dims, strides, padding)
-  if is_single_input:
-    y = jnp.squeeze(y, axis=0)
-  return y
-
-
-@op(torch.ops.aten._adaptive_avg_pool3d)
-def _aten_adaptive_avg_pool3d(x, output_shape):
-  return _aten_adaptive_avg_pool(x, output_shape, 3)
-
-
-@op(torch.ops.aten._adaptive_avg_pool2d)
-def _aten_adaptive_avg_pool3d(x, output_shape):
-  return _aten_adaptive_avg_pool(x, output_shape, 2)
-
-
-def _aten_adaptive_avg_pool(x, output_shape, pool_dim):
-
-  def adaptive_kernel_size(input_shape, output_shape):
-    sizes = [1, 1]
-    spatial_dim_off = len(input_shape) - pool_dim
-    for spatial_dim in range(pool_dim):
-      sizes.append(input_shape[spatial_dim_off + spatial_dim] //
-                   output_shape[spatial_dim])
-    return tuple(sizes)
-
-  kernel_sizes = adaptive_kernel_size(x.shape, output_shape)
-  y = pool(x, 0.0, jax.lax.add, kernel_sizes, kernel_sizes, padding='VALID')
-
-  div_shape = list(x.shape)
-  num_batch_dims = len(x.shape) - pool_dim - 1
-  div_shape[num_batch_dims] = 1
-  div_shape = tuple(div_shape)
-  if len(div_shape) - 2 == len(kernel_sizes):
-    div_shape = (1,) + div_shape[1:]
-  y = y / pool(
-      jnp.ones(div_shape), 0.0, jax.lax.add, kernel_sizes, kernel_sizes,
-      'VALID')
-  return y
-
-
-# aten.avg_pool2d
-@op(torch.ops.aten.avg_pool2d)
-@op(torch.ops.aten.avg_pool3d)
-def _aten_avg_pool(inputs,
-                   kernel_size,
-                   strides=None,
-                   padding=0,
-                   ceil_mode=False,
-                   count_include_pad=True,
-                   divisor_override=None):
-
-  num_batch_dims = len(inputs.shape) - len(kernel_size) - 1
-  kernel_size = tuple(kernel_size)
-  strides = tuple(strides)
-  if isinstance(padding, int):
-    padding = tuple((padding, padding) for _ in range(len(kernel_size)))
-  elif isinstance(padding, list):
-    padding = tuple((p, p) for p in padding)
-
-  y = pool(inputs, 0.0, jax.lax.add, kernel_size, strides, padding)
-  if count_include_pad:
-    y = y / np.prod(kernel_size)
-  else:
-    div_shape = list(inputs.shape)
-    div_shape[num_batch_dims] = 1
-    div_shape = tuple(div_shape)
-    if len(div_shape) - 2 == len(kernel_size):
-      div_shape = (1,) + div_shape[1:]
-    y = y / pool(
-        jnp.ones(div_shape), 0.0, jax.lax.add, kernel_size, strides, padding)
-  return y
-
-
-# aten.sym_numel
-# aten.reciprocal
-@op(torch.ops.aten.reciprocal)
-def _aten_reciprocal(a):
-  return 1 / a
-
-
-# aten.scatter
-@op(torch.ops.aten.select_scatter)
-def _aten_select_scatter(input, src, dim, index):
-  input_indexes = []
-  for x in range(len(input.shape)):
-    if x == dim:
-      input_indexes.append(index)
-    else:
-      input_indexes.append(slice(None, None, None))
-  return input.at[tuple(input_indexes)].set(src)
-
-
-@op(torch.ops.aten.scatter.src)
-def _aten_scatter_src(input, dim, index, src, reduce=None):
-  input_index, source_indexes = _scatter_index(dim, index)
-  return input.at[input_index].set(src[source_indexes])
-
-
-@op(torch.ops.aten.scatter.value)
-def _aten_scatter(input, dim, index, src, reduce=None):
-  input_index, source_indexes = _scatter_index(dim, index)
-  return input.at[input_index].set(src)
-
-
-# aten.acosh
-@op(torch.ops.aten.acosh)
-def _aten_acosh(self):
-  return jnp.arccosh(self)
-
-
-# aten.avg_pool2d_backward
-# aten.col2im
-# aten.avg_pool3d
-# aten.round
-@op(torch.ops.aten.round)
-def _aten_round(input, decimals=0):
-  return jnp.round(input, decimals)
-
-
-# aten.max
-@op(torch.ops.aten.max)
-def _aten_max(self, dim=None, keepdim=False):
-  return jnp.max(
-      self, axis=dim, keepdims=keepdim), jnp.argmax(
-          self, axis=dim, keepdims=keepdim)
-
-
-# aten.maximum
-@op(torch.ops.aten.maximum)
-def _aten_maximum(self, other):
-  return jnp.maximum(self, other)
-
-
-# aten.abs
-@op(torch.ops.aten.abs)
-def _aten_abs(self):
-  return jnp.abs(self)
-
-
-# generate aten.amax only
-@op(torch.ops.aten.amax)
-def _aten_amax(self, dim=None, keepdim=False):
-  return jnp.amax(self, axis=dim, keepdims=keepdim)
-
-
-# aten.any
-@op(torch.ops.aten.any)
-def _aten_any(self, dim=None, keepdim=False):
-  return jnp.any(self, axis=dim, keepdims=keepdim)
-
-
-# aten.arange
-@op(torch.ops.aten.arange)
-def _aten_arange(start,
-                 end=None,
-                 step=1,
-                 *,
-                 dtype=None,
-                 layout=None,
-                 requires_grad=False,
-                 device=None,
-                 pin_memory=False):
-  if end is None:
-    end = start
-    start = 0
-  dtype = tensor.t2j_dtype(dtype)
-  return jnp.arange(
-      start,
-      end,
-      step,
-      dtype=dtype,
-  )
-
-
-# aten.argmax
-@op(torch.ops.aten.argmax)
-def _aten_argmax(self, dim=None, keepdim=False):
-  return jnp.argmax(self, axis=dim, keepdims=keepdim)
-
-
-# aten.as_strided
-@op(torch.ops.aten.as_strided)
-@op(torch.ops.aten.as_strided_copy)
-def _aten_as_strided(x, sizes, strides, storage_offset=None):
-  ind = jnp.zeros(sizes, dtype=jnp.int32)
-
-  for i, (size, stride) in enumerate(zip(sizes, strides)):
-    result_shape = (1,) * i + (size,) + (1,) * (len(sizes) - i - 1)
-    indexes = (jnp.arange(size) * stride).reshape(result_shape)
-    ind += indexes
-
-  return jnp.ravel(x)[ind]
-
-
-# aten.atan2
-@op(torch.ops.aten.atan2)
-def _aten_atan2(self, other):
-  return jnp.arctan2(self, other)
-
-
-# aten.bitwise_and
-@op(torch.ops.aten.bitwise_and)
-def _aten_bitwise_and(self, other):
-  return self & other
-
-
-# aten.bitwise_or
-@op(torch.ops.aten.bitwise_or)
-def _aten_bitwise_or(self, other):
-  return self | other
-
-
-# aten.bitwise_xor
-@op(torch.ops.aten.bitwise_xor)
-def _aten_bitwise_xor(self, other):
-  return self ^ other
-
-
-# aten.clamp
-@op(torch.ops.aten.clamp)
-def _aten_clamp(self, min=None, max=None):
-  return jnp.clip(self, min, max)
-
-
-# aten.constant_pad_nd
-@op(torch.ops.aten.constant_pad_nd)
-def _aten_constant_pad_nd(input, padding, value=0):
-  # NOTE: Torch padding is flat and reversed: (1, 1, 2, 2)
-  #  means last dim get padded 1 in front and 1 in back;
-  #  and second last dim get padded 2 in front and 2 in back.
-  # Jax padding tuple of 2-tuple: the same padding is
-  # [(0, 0), ..., (2,2), (1,1)]
-  m = len(padding)
-  rev_padding = [(padding[i - 1], padding[i]) for i in range(m - 1, 0, -2)]
-  pad_dim = tuple(([(0, 0)] * (len(input.shape) - m // 2)) + rev_padding)
-  return jnp.pad(input, pad_dim, mode="constant", constant_values=value)
-
-
-# aten.convolution_backward
-@op(torch.ops.aten.copy)
-@op(torch.ops.aten.lift_fresh_copy)
-def _aten_copy(x):
-  return jnp.copy(x)
-
-
-@op(torch.ops.aten._cdist_forward)
-def _aten_cdist_forward(x1, x2, p, compute_mode=''):
-  # x1 is B x P x M
-  # x2 is B x Q x M
-  # res is B x P x Q
-  x1 = jnp.expand_dims(x1, len(x1.shape) - 1)
-  x2 = jnp.expand_dims(x2, len(x2.shape) - 2)
-  return jnp.linalg.norm(x1 - x2, ord=p, axis=-1)
-
-
-@op(torch.ops.aten._pdist_forward)
-def _aten__pdist_forward(x, p):
-  pairwise_dists = _aten_cdist_forward(x, x, p)
-  condensed_dists = pairwise_dists[jnp.triu_indices(
-      pairwise_dists.shape[0], k=1)]
-  return condensed_dists
-
-
-# aten.cos
-@op(torch.ops.aten.cos)
-def _aten_cos(input):
-  return jnp.cos(input)
-
-
-# aten.cosh
-@op(torch.ops.aten.cosh)
-def _aten_cosh(input):
-  return jnp.cosh(input)
-
-
-# aten.diagonal
-@op(torch.ops.aten.diagonal)
-def _aten_diagonal(input, offset=0, dim1=0, dim2=1):
-  return jnp.diagonal(input, offset, dim1, dim2)
-
-
-# aten.empty_strided
-# aten.eq
-@op(torch.ops.aten.eq)
-def _aten_eq(input1, input2):
-  return input1 == input2
-
-
-# aten.erf
-@op(torch.ops.aten.erf)
-def _aten_erf(x):
-  if x.dtype in (jnp.int32, jnp.int64):
-    x = x.astype(jnp.float32)
-  return jax.lax.erf(x)
-
-
-# aten.exp
-@op(torch.ops.aten.exp)
-def _aten_exp(input):
-  return jnp.exp(input)
-
-
-# aten.expm1
-@op(torch.ops.aten.expm1)
-def _aten_expm1(input):
-  return jnp.expm1(input)
-
-
-# aten.fill
-@op(torch.ops.aten.fill)
-@op(torch.ops.aten.full_like)
-def _aten_fill(x, value, dtype=None, pin_memory=None, memory_format=None):
-  if dtype is None:
-    dtype = x.dtype
-  else:
-    dtype = tensor.t2j_dtype(dtype)
-  return jnp.full(x.shape, value, dtype)
-
-
-# aten.flip
-@op(torch.ops.aten.flip)
-def _aten_flip(input, dims):
-  if dims is not None:
-    return jnp.flip(input, tuple(dims))
-  else:
-    return jnp.flip(input)
-
-
-# aten.floor
-@op(torch.ops.aten.floor)
-def _aten_floor(input):
-  return jnp.floor(input)
-
-
-# aten.fmod
-@op(torch.ops.aten.fmod)
-def _aten_fmod(input, other):
-  return input - other * _aten_div(input, other, 'trunc')
-
-
-# aten.gather
-@op(torch.ops.aten.gather)
-def _aten_gather(input, dim, index):
-  input_indexes, source_indexes = _scatter_index(dim, index)
-  return input[input_indexes]
-
-
-# aten.ge
-@op(torch.ops.aten.ge)
-def _aten_ge(self, other):
-  return self >= other
-
-
-@op(torch.ops.aten.glu)
-@op(torch.ops.aten.glu.default)
-def _aten_glu(x, dim=-1):
-  return jax.nn.glu(x, dim)
-
-
-# aten.hardtanh
-@op(torch.ops.aten.hardtanh)
-def _aten_hardtanh(input, min_val=-1., max_val=1., inplace=False):
-  return jnp.clip(input, min_val, max_val)
-
-
-# aten.isinf
-@op(torch.ops.aten.isinf)
-def _aten_isinf(input):
-  return jnp.isinf(input)
-
-
-# aten.isnan
-@op(torch.ops.aten.isnan)
-def _aten_isnan(input):
-  return jnp.isnan(input)
-
-
-@op(torch.ops.aten.le)
-def _aten_le(self, other):
-  return self <= other
-
-
-# aten.leaky_relu
-@op(torch.ops.aten.leaky_relu)
-def _aten_leaky_relu(x, negative_slope):
-  return jax.nn.leaky_relu(x, negative_slope)
-
-
-# aten.log
-@op(torch.ops.aten.log)
-def _aten_log(x):
-  return jnp.log(x)
-
-
-# aten.log10
-@op(torch.ops.aten.log10)
-def _aten_log10(x):
-  return jnp.log10(x)
-
-
-# aten.log1p
-@op(torch.ops.aten.log1p)
-def _aten_log1p(x):
-  return jnp.log1p(x)
-
-
-# aten.log2
-@op(torch.ops.aten.log2)
-def _aten_log2(x):
-  return jnp.log2(x)
-
-
-# aten.logical_and
-@op(torch.ops.aten.logical_and)
-def _aten_logical_and(self, other):
-  return jnp.logical_and(self, other)
-
-
-# aten.logical_or
-@op(torch.ops.aten.logical_or)
-def _aten_logical_or(self, other):
-  return jnp.logical_or(self, other)
-
-
-# aten.logical_not
-@op(torch.ops.aten.logical_not)
-def _aten_logical_not(self):
-  return jnp.logical_not(self)
-
-
-# aten.log_softmax
-@op(torch.ops.aten._log_softmax)
-def _aten_log_softmax(self, axis=-1, half_to_float=False):
-  return jax.nn.log_softmax(self, axis)
-
-
-# aten.max_pool3d_backward
-# aten.logical_xor
-@op(torch.ops.aten.logical_xor)
-def _aten_logical_xor(self, other):
-  return jnp.logical_xor(self, other)
-
-
-# aten.max_pool2d_with_indices_backward
-# aten.native_dropout
-# aten.native_group_norm_backward
-# aten.neg
-@op(torch.ops.aten.neg)
-def _aten_neg(x):
-  return -1 * x
-
-
-# aten.nonzero
-@op(torch.ops.aten.nonzero)
-def _aten_nonzero(x):
-  index_tuple = jnp.nonzero(x)
-  index_tuple = [jnp.expand_dims(p, -1) for p in index_tuple]
-  return jnp.concatenate(index_tuple, axis=-1)
-
-
-# aten.prod
-
-
-@op(torch.ops.aten.prod)
-def _aten_prod(self, dim=None, keepdim=False):
-  return jnp.prod(self, axis=dim, keepdims=keepdim)
-
-
-# aten.rand
-# aten.randn
-# aten.randperm
-# aten.reflection_pad3d
-# aten.remainder
-@op(torch.ops.aten.remainder)
-def _aten_remainder(inputs, other):
-  return inputs % other
-
-
-# aten.repeat
-@op(torch.ops.aten.repeat)
-def _aten_repeat(x, reps):
-  return jnp.tile(x, reps)
-
-
-# aten.replication_pad2d
-# aten.replication_pad3d
-# aten.roll
-@op(torch.ops.aten.roll)
-def _aten_roll(input, shifts, dims=None):
-  return jnp.roll(input, shifts, dims)
-
-
-# aten.scalar_tensor
-# aten.slice_scatter
-@op(torch.ops.aten.slice_scatter)
-def _aten_slice_scatter(input, src, dim=0, start=None, end=None, step=1):
-  input_index = []
-  for x in range(len(input.shape)):
-    if x == dim:
-      input_index.append(slice(start, end, step))
-    else:
-      input_index.append(slice(None, None, None))
-  return input.at[tuple(input_index)].set(src)
-
-
-# aten.sort
-# torch.sort(input, dim=-1, descending=False, stable=False, *, out=None)
-@op(torch.ops.aten.sort)
-def _aten_sort(a, dim=-1, descending=False, stable=False):
-  return (
-      jnp.sort(a, axis=dim, stable=stable, descending=descending),
-      jnp.argsort(a, axis=dim, stable=stable, descending=descending),
-  )
-
-
-# aten.sym_size
-
-
-# aten.topk
-@op(torch.ops.aten.topk)
-def _aten_topk(input, k, dim=None, largest=True, sorted=True, *, out=None):
-  """JAX top-k implementation using jax.lax.top_k for improved efficiency.
-
-  Args:
-      input: The input JAX array.
-      k: The number of top elements to return.
-      dim: The dimension along which to find the top-k. If None, operates on the
-        flattened array.
-      largest: If True, returns the largest k elements. Otherwise, smallest k.
-      sorted: If True, returns the elements in sorted order.
-
-  Returns:
-      A tuple (values, indices) containing:
-          - values: The top k values.
-          - indices: The indices of the top k values in the original array.
-  """
-  if dim is None:
-    input = input.flatten()
-    dim = 0
-
-  if not largest:
-    input = -input  # Find top-k of negated input if we want the smallest
-
-  transpose_shape = None
-  if dim != -1 and dim != len(input.shape) - 1:
-    transpose_shape = list(range(len(input.shape)))
-    transpose_shape[dim], transpose_shape[-1] = (transpose_shape[-1],
-                                                 transpose_shape[dim])
-    input = jnp.transpose(input, transpose_shape)
-
-  values, indices = jax.lax.top_k(input, k)
-
-  if sorted:
-    values = jnp.sort(values, descending=True)
-    indices = jnp.take_along_axis(
-        indices, jnp.argsort(values, axis=-1, descending=True), axis=-1)
-
-  if not largest:
-    values = -values  # Negate values back if we found smallest
-
-  if transpose_shape is not None:
-    values = jnp.transpose(values, transpose_shape)
-    indices = jnp.transpose(indices, transpose_shape)
-
-  return values, indices
-
-
-# aten.trunc
-@op(torch.ops.aten.trunc)
-def _aten_trunc(a):
-  return jnp.trunc(a)
-
-
-@op(torch.ops.aten.unbind)
-@op(torch.ops.aten.unbind_copy)
-def _aten_unbind(a, dim=0):
-  return tuple(
-      _aten_squeeze_dim(jax.lax.index_in_dim(a, i, axis=dim), dim)
-      for i in range(a.shape[dim]))
-
-
-# NOTE: skip aten.upsample_nearest2d and aten.upsample_bilinear2d
-# despite those being core aten ops, they also have decompositions.
-# here we are using torch decompositions.
-
-
-# aten.where
-@op(torch.ops.aten.where)
-def _aten_where(condition, x, y):
-  return jnp.where(condition, x, y)
-
-
-# aten.to.dtype
-#Tensor(a) self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None
-@op(torch.ops.aten.to.dtype)
-def _aten_to_dtype(a,
-                   dtype,
-                   non_blocking=False,
-                   copy=False,
-                   memory_format=None):
-  jaxdtype = tensor.t2j_dtype(dtype)
-  return a.astype(jaxdtype)
-
-
-# aten.to.device
-
-
-#Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False
-@op(torch.ops.aten.var_mean.correction)
-def _aten_var_mean_correction(self, dim=None, correction=None, keepdim=False):
-  return (jnp.var(self, axis=dim, ddof=correction,
-                  keepdims=keepdim), jnp.mean(self, dim, keepdims=keepdim))
-
-
-@op(torch.ops.aten.scalar_tensor)
-def _aten_scalar_tensor(s,
-                        dtype=None,
-                        layout=None,
-                        device=None,
-                        pin_memory=None):
-  if dtype is not None:
-    dtype = tensor.t2j_dtype(dtype)
-    return jnp.array(s, dtype=dtype)
-  return jnp.array(s)
-
-
-@op(torch.ops.aten.to.device)
-def _aten_to_device(x,device, dtype):
-  return x
-
-
-@op(torch.ops.aten.max_pool2d_with_indices_backward)
-def max_pool2d_with_indices_backward_custom(grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices):
-
-  """
-  Approximates the gradient calculation of PyTorch's max_pool2d_with_indices_backward.
-
-  Args:
-      grad_output: The gradient tensor from the preceding layer.
-      self: The input tensor on which the original max pooling was performed.
-      kernel_size: The size of the pooling window.
-      stride: The stride of the pooling window.
-      padding: The padding applied during max pooling.
-      dilation: The dilation factor for the pooling operation.
-      ceil_mode: Whether to use ceil or floor when calculating output shapes.
-      indices: The indices of the maximum values, as produced by max_pool2d_with_indices.
-
-  Returns:
-      The calculated gradient with respect to the input (grad_input).
-  """
-
-  kH, kW = kernel_size
-  dH, dW = stride
-  padH, padW = padding
-  dilH, dilW = dilation
-
-  # Calculate output shape (may need adjustment based on ceil_mode)
-  out_shape = jnp.array(self.shape)
-  grad_input = jnp.zeros_like(self)
-
-  # Iterate over the flattened input and output tensors
-  for i, idx in enumerate(indices.flatten()):
-    # Calculate input coordinates corresponding to the maximum value
-    out_y, out_x = i // grad_output.shape[3], i % grad_output.shape[3]
-    in_y = out_y * dH - padH + out_y * (dilH - 1)
-    in_x = out_x * dW - padW + out_x * (dilW - 1)
-
-    # Scatter the gradient to the appropriate input locations (handling potential overlaps)
-    for y in range(in_y, in_y + kH):
-      for x in range(in_x, in_x + kW):
-        if 0 <= y < grad_input.shape[2] and 0 <= x < grad_input.shape[3]:
-          grad_input = grad_input.at[y, x].add(grad_output.flatten()[i])  
-
-  return grad_input
-
-
-@op(torch.ops.aten._local_scalar_dense)
-def _aten_local_scalar_dense(x):
-  return x.item()
-
-@op(torch.ops.aten.tensor_split.sections)
-def _aten_tensor_split(ary, indices_or_sections, axis=0):
-  return jnp.array_split(ary, indices_or_sections, axis)
\ No newline at end of file
diff --git a/experimental/torch_xla2/torch_xla2/decompositions.py b/experimental/torch_xla2/torch_xla2/decompositions.py
index e85e49e13ee..81b48bb5da8 100644
--- a/experimental/torch_xla2/torch_xla2/decompositions.py
+++ b/experimental/torch_xla2/torch_xla2/decompositions.py
@@ -90,4 +90,21 @@ def _reflection_or_replication_pad(
     return result
 
 _try_register(aten.replication_pad1d, _replication_pad)
-_try_register(aten.replication_pad3d, _replication_pad)
\ No newline at end of file
+_try_register(aten.replication_pad3d, _replication_pad)
+
+EXTRA_DECOMP = decomp.get_decompositions([
+    torch.ops.aten.upsample_nearest2d,
+    torch.ops.aten._native_batch_norm_legit.no_stats,
+    torch.ops.aten._adaptive_avg_pool2d,
+    torch.ops.aten._adaptive_avg_pool3d,
+    torch.ops.aten.grid_sampler_2d,
+    torch.ops.aten.native_dropout,
+    torch.ops.aten.reflection_pad1d,
+    torch.ops.aten.reflection_pad2d,
+    torch.ops.aten.reflection_pad3d,
+    torch.ops.aten.replication_pad1d,
+    torch.ops.aten.replication_pad2d,
+    torch.ops.aten.replication_pad3d,
+])
+
+EXTRA_DECOMP[torch.ops.aten.uniform] = torch.ops.aten.rand
\ No newline at end of file
diff --git a/experimental/torch_xla2/torch_xla2/environment.py b/experimental/torch_xla2/torch_xla2/environment.py
index 6a71c7d51c0..139597f9cb0 100644
--- a/experimental/torch_xla2/torch_xla2/environment.py
+++ b/experimental/torch_xla2/torch_xla2/environment.py
@@ -1,26 +1,2 @@
-import jax
-
-
-class Environment:
-    """This class holds a set of configurations and "globals" needed
-
-    for executing torch program using jax.
-    Things included so far:
-
-    op registry
-    PRNGKey
-    Configs
-
-    Also helper functions to manipulate those.
-    """
-
-    _prng_key: jax.random.PRNGKey
-
-
-    def __init__(self, random_seed):
-        self._prng_key = jax.random.PRNGKey(random_seed)
-
-    def get_and_rotate_prng_key(self):
-        self._prng_key, key = jax.random.split(self._prng_key)
 
 
diff --git a/experimental/torch_xla2/torch_xla2/export.py b/experimental/torch_xla2/torch_xla2/export.py
index 64a3f9d175c..387d9889386 100644
--- a/experimental/torch_xla2/torch_xla2/export.py
+++ b/experimental/torch_xla2/torch_xla2/export.py
@@ -2,145 +2,13 @@
 """Utilities for exporting a torch program to jax/stablehlo."""
 import copy
 from typing import Any, Dict, Tuple
-import jax
 import torch
-from torch.fx import _pytree as fx_pytree
-from torch_xla2 import ops_registry, tensor
 from torch.utils import _pytree as pytree
-
-
-class JaxProgram:
-
-  def _wrap_inputs(self, xs, allow_torch_tensor=False):
-
-    def convert(t):
-      if isinstance(t, tensor.XLATensor2):
-        return t
-      if isinstance(t, torch.Tensor):
-        if allow_torch_tensor:
-          return tensor.move_to_device(t)
-        else:
-          raise ValueError('Regular torch.Tensor is not allowed.')
-      if isinstance(t, jax.Array):
-        return tensor.XLATensor2(t)
-      return t
-
-    return jax.tree_util.tree_map(convert, xs)
-
-  def _unwrap_outputs(self, xs):
-
-    def convert(t):
-      if isinstance(t, tensor.XLATensor2):
-        return t.jax()
-      if isinstance(t, torch.Tensor):
-        raise ValueError('Regular torch.Tensor is not allowed.')
-      return t
-
-    return jax.tree_util.tree_map(convert, xs)
-
-  def __init__(
-      self,
-      exported_program,
-      param_buffer_values,
-      ordered_tensor_constants,
-  ):
-
-    self.param_buffer_values = self._wrap_inputs(
-        param_buffer_values, allow_torch_tensor=True)
-    self.ordered_tensor_constants = self._wrap_inputs(
-        ordered_tensor_constants, allow_torch_tensor=True)
-    self.exported_program = exported_program
-
-  def __hash__(self):
-    return hash(self.exported_program)
-
-  @property
-  def example_inputs(self):
-    args, kwargs = self.exported_program.example_inputs
-    args = pytree.tree_map(tensor.t2j, args)
-    kwargs = pytree.tree_map(tensor.t2j, kwargs)
-    return args, kwargs
-
-  def flatten_inputs(self, args, kwargs):
-    if args is None:
-      args = tuple()
-    if kwargs is None:
-      kwargs = {}
-
-    if (in_spec := self.exported_program.call_spec.in_spec) is not None:
-      if (in_spec.type == tuple and len(in_spec.children_specs) == 2 and
-          in_spec.children_specs[0].type == tuple and
-          in_spec.children_specs[1].type == dict):
-        # NOTE: this is the case where in_spec is for both args and kwargs
-        return fx_pytree.tree_flatten_spec((args, kwargs), in_spec)
-      return fx_pytree.tree_flatten_spec(args, in_spec)
-    return copy.deepcopy(args)
-
-  def unflatten_outputs(self, res):
-    return pytree.tree_unflatten(res, self.exported_program.call_spec.out_spec)
-
-  def __call__(self, *args, **kwargs):
-
-    inputs = self.flatten_inputs(args, kwargs)
-    res = self.flatten_callable(*inputs)
-    res = self.unflatten_outputs(res)
-
-    return res
-
-  @property
-  def flatten_callable(self):
-
-    def func(*inputs: jax.Array):
-      nonlocal self
-      inputs = self._wrap_inputs(inputs)
-      num_mutations = len(
-          self.exported_program.graph_signature.buffers_to_mutate)
-      res = torch.fx.Interpreter(self.exported_program.graph_module).run(
-          *self.param_buffer_values,
-          *inputs,
-          *self.ordered_tensor_constants,
-          enable_io_processing=False,
-      )
-      res = res[num_mutations:]
-      res = self._unwrap_outputs(res)
-      return res
-
-    return func
-
-  def jit(self, *args, **kwargs):
-    """Returns `jax.jit(self, *args, **kwargs)`."""
-    return jax.jit(self, *args, **kwargs)
-
-  def jit_lower(self, *args, **kwargs):
-    """Returns `jax.jit(self, *args, **kwargs).lower(...)` with example_inputs used in export."""
-    example_args, example_kwargs = self.example_inputs
-    return self.jit(*args, **kwargs).lower(*example_args, **example_kwargs)
-
-
-def exported_program_to_jax_program(ep):
-  """exported_program_to_jax_program.
-
-  Args:
-    ep: torch.export.ExportedProgram
-
-  Returns:
-    JaxProgram
-
-  """
-  if torch.__version__ >= '2.2':
-    ep = ep.run_decompositions()
-
-  param_buffer_keys = ep.graph_signature.parameters + ep.graph_signature.buffers
-  param_buffer_values = tuple(ep.state_dict[key] for key in param_buffer_keys)
-
-  if hasattr(ep.graph_signature, 'lifted_tensor_constants'):
-    ordered_tensor_constants = tuple(
-        ep.tensor_constants[name]
-        for name in ep.graph_signature.lifted_tensor_constants)
-  else:
-    ordered_tensor_constants = tuple()
-
-  return JaxProgram(ep, param_buffer_values, ordered_tensor_constants)
+from torch_xla2 import tensor
+from torch_xla2.ops import ops_registry
+import jax
+import jax.numpy as jnp
+import sympy
 
 
 DEBUG = False
@@ -149,6 +17,11 @@ def exported_program_to_jax_program(ep):
 class JaxInterpreter(torch.fx.Interpreter):
   """Experimental."""
 
+  def __init__(self, graph_module):
+    super().__init__(graph_module)
+    import torch_xla2.ops.jaten
+    import torch_xla2.ops.jtorch
+
   def call_function(self, target, args: Tuple, kwargs: Dict) -> Any:
     if not isinstance(target,
                       (torch._ops.OpOverloadPacket, torch._ops.OpOverload)):
@@ -157,7 +30,9 @@ def call_function(self, target, args: Tuple, kwargs: Dict) -> Any:
     if DEBUG:
       print('Running ', target.name(), '--------')
 
-    op = ops_registry.lowerings.lookup(target)
+    op = ops_registry.all_aten_ops.get(target)
+    if op is None:
+      op = ops_registry.all_aten_ops.get(target.overloadpacket)
     if op is None:
       print(target.name(), target.tags)
       raise RuntimeError('No lowering found for', target.name())
@@ -231,3 +106,125 @@ def func(states, inputs):
 
   states = pytree.tree_map_only(torch.Tensor, tensor.t2j, states)
   return states, func
+
+
+def extract_avals(exported):
+  """Return JAX Abstract Value shapes for all input parameters of the exported
+  program. This supports dynamic batch dimensions, including with constraints.
+  """
+
+  def _to_aval(arg_meta, symbolic_shapes):
+    """Convet from torch type to jax abstract value for export tracing
+    """
+    def _get_dim(d):
+      if isinstance(d, torch.SymInt):
+        return symbolic_shapes[str(d)]
+      return d
+
+    val = arg_meta['val']
+    is_scalar = isinstance(val, float) or isinstance(val, int) or isinstance(val, bool)
+    if is_scalar:
+      return jax.ShapeDtypeStruct([], type(arg_meta['val']))
+
+    tensor_meta = arg_meta['tensor_meta']
+    shape = [_get_dim(d) for d in tensor_meta.shape]
+    return jax.ShapeDtypeStruct(shape, tensor.t2j_dtype(tensor_meta.dtype))
+
+  def _get_inputs(exported):
+    """Return placeholders with input metadata"""
+    placeholders = [p for p in exported.graph.nodes if p.op == "placeholder"]
+    input_placeholders = [
+      p
+      for p, s in zip(placeholders, exported.graph_signature.input_specs)
+      if s.kind == torch.export.graph_signature.InputKind.USER_INPUT
+    ]
+    return input_placeholders
+
+  def _build_symbolic_shapes(range_constraints):
+    """Convert torch SymInt to JAX symbolic_shape and stores in a map using the
+    string name of the torch symbolic int.
+ 
+    TODO: There is probably a better way of storing a key for a symbolic int.
+    This value needs to be looked up again in `_to_aval` to figure out which
+    JAX symbolic to map to for a given torch tensor.
+    """
+    if len(range_constraints) == 0:
+      return None
+
+    def _build_symbolic_constraints(symbol_name, torch_constraint):
+      """Convert torch SymInt constraints to string for JAX symbolic_shape
+      Using sympy may be overkill here, currently PyTorch only uses ValueRanges
+      which allow specifying the min and the max of a value, for example:
+        torch.export.Dim("a", min=5, max=10)
+         ==> ("a >= 5", "a <= 10",)
+      """
+      if not isinstance(torch_constraint, torch.utils._sympy.value_ranges.ValueRanges) or torch_constraint.is_bool:
+        raise TypeError(f"No symbolic constraint handler for: {torch_constraint}")
+
+      constraints = []
+      symbol = sympy.Symbol(symbol_name)
+      if torch_constraint.lower != 2:
+        constraints.append(symbol >= torch_constraint.lower)
+      if not torch_constraint.upper.is_infinite:
+        constraints.append(symbol <= torch_constraint.upper)
+      
+      return tuple(sympy.pretty(c, use_unicode=False) for c in constraints)
+
+    def _build_symbolic_shape(sym, constraint, free_symbols):
+      """Returns a JAX symbolic shape for a given symbol and constraint
+
+      There are two possible sympy `sym` inputs:
+        1. Symbol - (s0) These can have custom constraints.
+        2. Expr - (s0*2) These apply the expr to s0's constraints, cannot override.
+      
+        Currently support is limited to operations with a symbol and and int,
+        in `torch/export/dynamic_shapes.py`:
+        "Only increasing linear operations with integer coefficients are supported."
+      """
+      symbol_name = str(sym)
+      constraints = _build_symbolic_constraints(symbol_name, constraint)
+      if sym.is_symbol:
+        symbolic_shape = jax.experimental.export.symbolic_shape(symbol_name, constraints=constraints)
+      else:
+        assert len(sym.free_symbols) > 0
+        scope = free_symbols[str(list(sym.free_symbols)[0])].scope
+        symbolic_shape = jax.experimental.export.symbolic_shape(symbol_name, scope=scope)
+      assert len(symbolic_shape) == 1
+      return symbolic_shape[0]
+
+    # Populate symbol variables before expressions, exprs need to use the same
+    # Symbolic scope as the variable they operate on. Expressions can only be
+    # integer compuations on symbol variables, so each symbol variable is OK to
+    # have its own scope.
+    symbolic_shapes = {}
+    symbol_variables = [(s,v) for s,v in range_constraints.items() if s.is_symbol]
+    symbol_exprs = [(s,v) for s,v in range_constraints.items() if not s.is_symbol]
+    for sym, constraint in symbol_variables + symbol_exprs:
+      symbolic_shape = _build_symbolic_shape(sym, constraint, symbolic_shapes)
+      symbolic_shapes[str(sym)] = symbolic_shape
+    return symbolic_shapes
+
+  symbolic_shapes = _build_symbolic_shapes(exported.range_constraints)
+  args = _get_inputs(exported)
+
+  if DEBUG:
+    print('Inputs to aval:', args, '--------')
+    print('Symbolic shapes:', symbolic_shapes)
+    for arg in args:
+      print('Meta2Aval', arg.meta, '--> ', _to_aval(arg.meta, symbolic_shapes))
+
+  return [_to_aval(arg.meta, symbolic_shapes) for arg in args]
+
+
+def exported_program_to_stablehlo(exported_program):
+  """Replacement for torch_xla.stablehlo.exported_program_to_stablehlo
+
+  Convert a program exported via torch.export to StableHLO.
+
+  This supports dynamic dimension sizes and generates explicit checks for
+  dynamo guards in the IR using shape_assertion custom_call ops.
+  """
+  weights, func = exported_program_to_jax(exported_program)
+  jax_avals = extract_avals(exported_program)
+  jax_export = jax.experimental.export.export(func)(weights, (jax_avals,))
+  return jax_export
diff --git a/experimental/torch_xla2/torch_xla2/extra.py b/experimental/torch_xla2/torch_xla2/extra.py
deleted file mode 100644
index ebfdb96b1db..00000000000
--- a/experimental/torch_xla2/torch_xla2/extra.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import jax
-import jax.numpy as jnp
-import functools
-import torch
-from torch.utils import _pytree as pytree
-from torch_xla2 import tensor
-
-def torch_view(t):
-    # t is an object from jax land
-    # view it as-if it's a torch land object
-    if isinstance(t, jax.Array):
-        return tensor.XLATensor2(t)
-    if isinstance(t, type(jnp.int32)):
-        return tensor.t2j_type(t)
-    if callable(t):
-        def new_t(*args, **kwargs):
-            # args, kwargs are torch-land 
-            args, kwargs = pytree.tree_map(jax_view, (args, kwargs))
-            # now they are objs in jax-land
-            res = t(*args, **kwargs) # t is jax callable
-            # res is jax-land obj
-            return pytree.tree_map(torch_view, res)
-        return new_t
-    # regular types are not changed
-    return t
-
-
-def jax_view(t):
-    # t is an object from torch land
-    # view it as-if it's a jax land object
-    if isinstance(t, torch.Tensor):
-        assert isinstance(t, tensor.XLATensor2)
-        return t.jax()
-    if isinstance(t, type(torch.int32)):
-        return tensor.j2t_dtype(t)
-    if callable(t):
-        def new_t(*args, **kwargs):
-            # args, kwargs are jax-land
-            args, kwargs = pytree.tree_map(torch_view, (args, kwargs))
-            # now they are objs in torch-land
-            res = t(*args, **kwargs)
-            # res is torch-land obj
-            return pytree.tree_map(jax_view, res)
-        return new_t
-    # regular types are not changed
-    return t
-
-def call_jax(jax_func, *args, **kwargs):
-    return torch_view(jax_func)(*args, **kwargs)
-
-
-def call_torch(torch_func, *args, **kwargs):
-    return jax_view(torch_func)(*args, **kwargs)
-
-
-fori_loop = torch_view(jax.lax.fori_loop)
-
-def jax_jit(torch_function, kwargs_for_jax_jit=None):
-    kwargs_for_jax_jit = kwargs_for_jax_jit or {}
-    jax_func = jax_view(torch_function)
-    jitted = jax.jit(jax_func, **kwargs_for_jax_jit)
-    return torch_view(jitted)
diff --git a/experimental/torch_xla2/torch_xla2/functions.py b/experimental/torch_xla2/torch_xla2/functions.py
deleted file mode 100644
index 9fcd5653a86..00000000000
--- a/experimental/torch_xla2/torch_xla2/functions.py
+++ /dev/null
@@ -1,109 +0,0 @@
-"""Tensor constructor overrides"""
-import functools
-import logging
-from typing import Callable, Optional, ParamSpec, Sequence
-
-import jax
-import torch
-import jax.numpy as jnp
-from torch_xla2 import tensor
-
-registry = {}
-
-P = ParamSpec('P')
-
-
-def register_function(torch_func: Callable[P, torch.Tensor]):
-  """Registers a function as the JAX implementation of a torch function."""
-
-  def decorator(jax_impl: Callable[P, jax.Array]):
-    registry[torch_func] = jax_impl
-    return jax_impl
-
-  return decorator
-
-
-def convert_dtype(use_default_dtype: bool = True):
-  """Converts `dtype` kwarg of function from torch to JAX.
-
-  Args:
-    use_default_dtype: Whether to use torch default dtype if none is provided.
-
-  Returns:
-    A decorator that wraps a JAX implementation of a torch function.
-  """
-
-  def decorator(func: Callable[P, torch.Tensor]):
-
-    @functools.wraps(func)
-    def wrapper(*args: P.args,
-                dtype: Optional[torch.dtype] = None,
-                **kwargs: P.kwargs):
-      if not dtype and use_default_dtype:
-        dtype = torch.get_default_dtype()
-      jax_dtype = tensor.t2j_dtype(dtype)
-
-      return func(*args, dtype=jax_dtype, **kwargs)
-
-    return wrapper
-
-  return decorator
-
-
-@register_function(torch.tensor)
-@convert_dtype(use_default_dtype=False)  # Attempt to infer type from elements
-def _tensor(data, *, dtype=None, **kwargs):
-  python_types_to_torch_types = {
-      bool: jnp.bool,
-      int: jnp.int64,
-      float: jnp.float32,
-      complex: jnp.complex64,
-  }
-  if not dtype:
-    leaves = jax.tree_util.tree_leaves(data)
-    if len(leaves) > 0:
-      dtype = python_types_to_torch_types.get(type(leaves[0]))
-
-  return jnp.array(
-      data, dtype=dtype or tensor.t2j_dtype(torch.get_default_dtype()))
-
-
-@register_function(torch.ones)
-@convert_dtype()
-def _ones(*size: int, dtype=None, **kwargs):
-  return jnp.ones(size, dtype)
-
-
-@register_function(torch.zeros)
-@convert_dtype()
-def _zeros(*size: int, dtype=None, **kwargs):
-  return jnp.zeros(size, dtype)
-
-
-@register_function(torch.eye)
-@convert_dtype()
-def _eye(n: int, m: Optional[int] = None, *, dtype=None, **kwargs):
-  return jnp.eye(n, m, dtype=dtype)
-
-
-@register_function(torch.full)
-@convert_dtype()
-def _full(size: Sequence[int], fill_value, *, dtype=None, **kwargs):
-  # TODO: handle torch.Size
-  return jnp.full(size, fill_value, dtype=dtype)
-
-
-class XLAFunctionMode(torch.overrides.TorchFunctionMode):
-  """Context manager that dispatches torch function calls to JAX."""
-
-  def __torch_function__(self,
-                         func,
-                         types,
-                         args=(),
-                         kwargs=None) -> torch.Tensor:
-    jax_func = registry.get(func)
-    if not jax_func:
-      return func(*args, **(kwargs or {}))
-
-    # TODO: unwrap args here or in implementations?
-    return tensor.wrap(jax_func(*args, **(kwargs or {})))
diff --git a/experimental/torch_xla2/torch_xla2/interop.py b/experimental/torch_xla2/torch_xla2/interop.py
new file mode 100644
index 00000000000..d1a96179e82
--- /dev/null
+++ b/experimental/torch_xla2/torch_xla2/interop.py
@@ -0,0 +1,69 @@
+import functools
+import torch
+import jax
+import jax.numpy as jnp
+from jax import tree_util as pytree
+from torch_xla2 import tensor
+import torch_xla2
+
+from torch_xla2.types import JaxValue, TorchValue, JaxCallable, TorchCallable
+
+
+
+def _torch_view(t: JaxValue) -> TorchValue:
+    # t is an object from jax land
+    # view it as-if it's a torch land object
+    if isinstance(t, jax.Array):
+        # TODO
+        return tensor.XLATensor2(t, torch_xla2.default_env())
+    if isinstance(t, type(jnp.int32)):
+        return tensor.t2j_type(t)
+    if callable(t):  # t is a JaxCallable
+        return functools.partial(call_jax, t)
+    # regular types are not changed
+    return t
+
+torch_view = functools.partial(pytree.tree_map, _torch_view)
+
+
+def _jax_view(t: TorchValue) -> JaxValue:
+    # t is an object from torch land
+    # view it as-if it's a jax land object
+    if isinstance(t, torch.Tensor):
+        assert isinstance(t, tensor.XLATensor2)
+        return t.jax()
+    if isinstance(t, type(torch.int32)):
+        return tensor.j2t_dtype(t)
+
+    # torch.nn.Module needs special handling
+    if not isinstance(t, torch.nn.Module) and callable(t):  # t is a TorchCallable
+        return functools.partial(call_torch, t)
+    # regular types are not changed
+    return t
+
+jax_view = functools.partial(pytree.tree_map, _jax_view)
+
+
+def call_jax(jax_func: JaxCallable, 
+             *args: TorchValue, 
+             **kwargs: TorchValue) -> TorchValue:
+    args, kwargs = jax_view((args, kwargs))
+    res: JaxValue = jax_func(*args, **kwargs)
+    return torch_view(res)
+
+
+def call_torch(torch_func: TorchCallable, *args: JaxValue, **kwargs: JaxValue) -> JaxValue:
+    args, kwargs = torch_view((args, kwargs))
+    with torch_xla2.default_env():
+        res: TorchValue = torch_func(*args, **kwargs)
+    return jax_view(res)
+
+
+fori_loop = torch_view(jax.lax.fori_loop)
+
+def jax_jit(torch_function, kwargs_for_jax_jit=None):
+    kwargs_for_jax_jit = kwargs_for_jax_jit or {}
+    jax_func = jax_view(torch_function)
+    jitted = jax.jit(jax_func, **kwargs_for_jax_jit)
+    return torch_view(jitted)
+
diff --git a/experimental/torch_xla2/torch_xla2/ops/__init__.py b/experimental/torch_xla2/torch_xla2/ops/__init__.py
index e69de29bb2d..abefc8344b1 100644
--- a/experimental/torch_xla2/torch_xla2/ops/__init__.py
+++ b/experimental/torch_xla2/torch_xla2/ops/__init__.py
@@ -0,0 +1,9 @@
+def all_aten_jax_ops():
+    # to load the ops
+    import torch_xla2.jaten # type: ignore
+    import torch_xla2.ops_registry # type: ignore
+    return {
+        key: val.func 
+        for key, val in torch_xla2.ops_registry.all_aten_ops
+        if val.is_jax_function
+    }
\ No newline at end of file
diff --git a/experimental/torch_xla2/torch_xla2/ops/jaten.py b/experimental/torch_xla2/torch_xla2/ops/jaten.py
index a30fae82de8..c5ca628908f 100644
--- a/experimental/torch_xla2/torch_xla2/ops/jaten.py
+++ b/experimental/torch_xla2/torch_xla2/ops/jaten.py
@@ -1,5 +1,14 @@
-"""This module contains implementation of ATen ops."""
+"""Torch ops implemented using jax."""
+
+import sys
+
+import jax
+from jax import numpy as jnp
+import numpy as np
 import torch
+from torch_xla2.ops import ops_registry
+from torch_xla2 import tensor
+from torch_xla2.ops import op_base
 
 # Keys are OpOverload, value is a callable that takes
 # XLATensor2
@@ -9,29 +18,1980 @@
 # and need to be implemented in jax
 
 mutation_ops_to_functional = {
-    torch.ops.aten.add_: torch.ops.aten.add,
-    torch.ops.aten.sub_: torch.ops.aten.sub,
-    torch.ops.aten.mul_: torch.ops.aten.mul,
-    torch.ops.aten.div_: torch.ops.aten.div,
-    torch.ops.aten.pow_: torch.ops.aten.pow,
-    torch.ops.aten.lt_: torch.ops.aten.lt,
-    torch.ops.aten.le_: torch.ops.aten.le,
-    torch.ops.aten.gt_: torch.ops.aten.gt,
-    torch.ops.aten.ge_: torch.ops.aten.ge,
-    torch.ops.aten.eq_: torch.ops.aten.eq,
-    torch.ops.aten.ne_: torch.ops.aten.ne,
+  torch.ops.aten.add_: torch.ops.aten.add,
+  torch.ops.aten.sub_: torch.ops.aten.sub,
+  torch.ops.aten.mul_: torch.ops.aten.mul,
+  torch.ops.aten.div_: torch.ops.aten.div,
+  torch.ops.aten.pow_: torch.ops.aten.pow,
+  torch.ops.aten.lt_: torch.ops.aten.lt,
+  torch.ops.aten.le_: torch.ops.aten.le,
+  torch.ops.aten.gt_: torch.ops.aten.gt,
+  torch.ops.aten.ge_: torch.ops.aten.ge,
+  torch.ops.aten.eq_: torch.ops.aten.eq,
+  torch.ops.aten.ne_: torch.ops.aten.ne,
+  torch.ops.aten.uniform_: torch.ops.aten.uniform,
+  torch.ops.aten.relu_: torch.ops.aten.relu,
 }
 
 
 def make_mutation(op):
+  return op_base.InplaceOp(mutation_ops_to_functional[op], position_to_mutate=0)
 
-  def f(*args, **kwargs):
-    res = mutation_ops_to_functional[op](*args, **kwargs)
-    args[0].copy_(res)
-    return args[0]
 
-  return f
+for op in mutation_ops_to_functional.keys():
+  ops_registry.register_torch_dispatch_op(
+    op, make_mutation(op), is_jax_function=False
+  )
 
 
-for op in mutation_ops_to_functional.keys():
-  all_ops[op] = make_mutation(op)
+def op(*aten, **kwargs):
+  def inner(func):
+    for a in aten:
+      ops_registry.register_torch_dispatch_op(a, func, **kwargs)
+    return func
+
+  return inner
+
+
+@op(
+  torch.ops.aten.view_copy,
+  torch.ops.aten.view,
+  torch.ops.aten._unsafe_view,
+  torch.ops.aten.reshape,
+)
+def _aten_unsafe_view(x, shape):
+  return jnp.reshape(x, shape)
+
+
+@op(torch.ops.aten.add.Tensor)
+@op(torch.ops.aten.add.Scalar)
+def _aten_add(x, y, *, alpha=1):
+  """if isinstance(x, jnp.ndarray) and isinstance(y, jnp.ndarray):
+
+  assert x.dtype == y.dtype, (x.dtype, y.dtype)
+  """
+  return x + y * alpha
+
+
+@op(torch.ops.aten.copy_, torch.ops.aten.copy_.default, is_jax_function=False)
+def _aten_copy(x, y, memory_format=None):
+  if isinstance(x, tensor.XLATensor2):
+    x._elem = y._elem
+  elif isinstance(x, tensor.SliceView):
+    x.mutate(y)
+  return x
+
+
+@op(torch.ops.aten.clone)
+@op(torch.ops.aten.clone.default)
+def _aten_clone(x, memory_format=None):
+  return jnp.copy(x)
+
+
+@op(torch.ops.aten.full)
+def _aten_full(size, value, **kwargs):
+  return jnp.full(size, value)
+
+
+@op(torch.ops.aten.index_copy)
+def _aten_index_copy(x, dim, indexes, source):
+  # return jax.lax.scatter(x, index, dim)
+  dims = []
+  for i in range(len(x.shape)):
+    if i == dim:
+      dims.append(indexes)
+    else:
+      dims.append(slice(None, None, None))
+  return x.at[dim].set(source)
+
+
+@op(torch.ops.aten.select)
+@op(torch.ops.aten.index_select)
+@op(torch.ops.aten.select_copy)
+def _aten_index_select(x, dim, indexes):
+  dims = []
+  for i in range(len(x.shape)):
+    if i == dim:
+      dims.append(indexes)
+    else:
+      dims.append(slice(None, None, None))
+  return x[tuple(dims)]
+
+
+@op(torch.ops.aten.mean)
+def _aten_mean(x, dim=None, keepdim=False):
+  return jnp.mean(x, dim, keepdims=keepdim)
+
+
+def _torch_binary_scalar_type(scalar, tensor):
+  if "float" in str(tensor.dtype):
+    return tensor.dtype
+
+  if isinstance(scalar, int):
+    if "int" in str(tensor.dtype):
+      return tensor.dtype
+
+  return jnp.float32
+
+
+@op(torch.ops.aten.sub.Tensor)
+@op(torch.ops.aten.sub.Scalar)
+def _aten_sub(x, y):
+  if isinstance(x, float):
+    dtype = _torch_binary_scalar_type(x, y)
+    x = jnp.array(x, dtype=dtype)
+  if isinstance(y, float):
+    dtype = _torch_binary_scalar_type(y, x)
+    y = jnp.array(y, dtype=dtype)
+  return x - y
+
+
+@op(torch.ops.aten.mm)
+def _aten_mm(x, y):
+  res = x @ y
+  return res
+
+
+@op(torch.ops.aten.mul.Tensor, torch.ops.aten.mul.Scalar)
+def _aten_mul(x, y):
+  return x * y
+
+
+@op(torch.ops.aten.silu)
+def _aten_silu(x):
+  return jax.nn.silu(x)
+
+
+@op(torch.ops.aten.t)
+def _aten_t(x):
+  return jnp.transpose(x)
+
+
+@op(torch.ops.aten.transpose)
+@op(torch.ops.aten.transpose_copy)
+def _aten_transpose(x, dim0, dim1):
+  shape = list(range(len(x.shape)))
+  shape[dim0], shape[dim1] = shape[dim1], shape[dim0]
+  return jnp.transpose(x, shape)
+
+
+@op(torch.ops.aten.triu)
+def _aten_triu(m, k):
+  return jnp.triu(m, k)
+
+
+@op(torch.ops.aten.slice)
+@op(torch.ops.aten.slice_copy)
+def _aten_slice(self, dim=0, start=None, end=None, step=1):
+  if end == sys.maxsize:
+    end = self.shape[dim]
+  sl = slice(start, end, step)
+  dims = []
+  for i in range(len(self.shape)):
+    if i == dim:
+      dims.append(sl)
+    else:
+      dims.append(slice(None, None, None))
+  return self[tuple(dims)]
+
+
+@op(torch.ops.aten.detach)
+def _aten_detach(self):
+  return self
+
+
+@op(torch.ops.aten.view_as_real)
+def _aten_view_as_real(x):
+  real = jnp.real(x)
+  im = jnp.imag(x)
+  res = jnp.stack([real, im], -1)
+  return res
+
+
+@op(torch.ops.aten.stack)
+def _aten_stack(tensors, dim=0):
+  return jnp.stack(tensors, dim)
+
+
+@op(torch.ops.aten._softmax)
+def _aten_softmax(x, dim, halftofloat):
+  return jax.nn.softmax(x, dim)
+
+
+@op(torch.ops.aten.pow)
+def _aten_pow(x, y):
+  if isinstance(y, int):
+    y = float(y)
+  return jnp.power(x, y)
+
+
+@op(torch.ops.aten.view_as_complex)
+def _aten_view_as_complex(input):
+  if input.dtype == jnp.bfloat16:
+    input = input.astype(jnp.float32)
+  x, y = input[..., 0], input[..., 1]
+  return jax.lax.complex(x, y)
+
+
+@op(torch.ops.aten.div)
+def _aten_div(x, y, rounding_mode=""):
+  res = x / y
+  if rounding_mode == "trunc":
+    res = jnp.trunc(res)
+  return res
+
+
+@op(torch.ops.aten.div_, is_jax_function=False)
+def _aten_div_(x, y, rounding_mode=""):
+  x._elem = _aten_div(x._elem, y._elem, rounding_mode)
+  return x
+
+
+@op(torch.ops.aten.true_divide)
+def _aten_true_divide(x, y):
+  return x / y
+
+
+@op(torch.ops.aten.bmm)
+def _aten_bmm(x, y):
+  res = x @ y
+  return res
+  # return jnp.einsum('bnm,bmk->bnk', x, y)
+
+
+@op(torch.ops.aten.embedding)
+# embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False)
+def _aten_embedding(a, w, padding_idx=-1):
+  return jnp.take(a, w, axis=0)
+
+
+@op(torch.ops.aten.rsqrt)
+def _aten_rsqrt(x):
+  if isinstance(x, int):
+    x = float(x)
+  if x.dtype == jnp.int32:
+    x = x.astype(jnp.float32)
+  return jax.lax.rsqrt(x)
+
+
+@op(torch.ops.aten.expand)
+@op(torch.ops.aten.expand_copy)
+def _aten_expand(x, dims):
+  def fix_dims(d, xs):
+    if d == -1:
+      return xs
+    return d
+
+  dims = [fix_dims(p, s) for p, s in zip(dims, x.shape)]
+  return jnp.broadcast_to(x, dims)
+
+
+@op(torch.ops.aten.dot)
+def _aten_dot(x, y):
+  return jnp.dot(x, y)
+
+
+@op(torch.ops.aten._to_copy)
+def _aten__to_copy(self, **kwargs):
+  dtype = tensor.t2j_dtype(kwargs["dtype"])
+  if dtype != self.dtype:
+    return self.astype(dtype)
+  return jnp.copy(self)
+
+
+@op(torch.ops.aten.empty)
+def _aten_empty(sizes, **kwargs):
+  return jnp.zeros(sizes)
+
+
+@op(torch.ops.aten.index_put_)
+@op(torch.ops.aten.index_put)
+def _aten_index_put(self, indexes, values, accumulate=False):
+  indexes = [slice(None, None, None) if i is None else i for i in indexes]
+  indexes = tuple(indexes)
+  if accumulate:
+    return self.at[indexes].add(values)
+  else:
+    return self.at[indexes].set(values)
+
+
+@op(torch.ops.aten.index)
+@op(torch.ops.aten._unsafe_index)
+@op(torch.ops.aten.index.Tensor)
+def _aten_index(self, indexes):
+  indexes = [slice(None, None, None) if i is None else i for i in indexes]
+  indexes = tuple(indexes)
+  return self[indexes]
+
+
+@op(torch.ops.aten.split)
+@op(torch.ops.aten.split_copy)
+@op(torch.ops.aten.split_with_sizes)
+def split_with_sizes(x, sizes, dim=0):
+  """Splits an array `x` into sub-arrays based on static sizes `sizes`.
+
+  Args:
+    x: The input array to split.
+    sizes: A 1D array of integer sizes for each sub-array.
+
+  Returns:
+    A list of sub-arrays.
+  """
+  if isinstance(sizes, int):
+    # split equal size
+    new_sizes = [sizes] * (x.shape[dim] // sizes)
+    sizes = new_sizes
+  rank = x.ndim
+  splits = np.cumsum(sizes)  # Cumulative sum for split points
+
+  def make_range(rank, dim, start, end):
+    res = [slice(None, None, None)] * rank
+    res[dim] = slice(start, end)
+    return tuple(res)
+
+  return [
+    x[make_range(rank, dim, start, end)]
+    for start, end in zip([0] + list(splits[:-1]), splits)
+  ]
+
+
+@op(torch.ops.aten.permute)
+@op(torch.ops.aten.permute_copy)
+def permute(t, dims):
+  return jnp.transpose(t, dims)
+
+
+@op(torch.ops.aten.unsqueeze)
+@op(torch.ops.aten.unsqueeze_copy)
+@op(torch.ops.aten.unsqueeze.default)
+def _aten_unsqueeze(self, dim):
+  if dim < 0:
+    dim += self.ndim + 1
+  return jnp.expand_dims(self, dim)
+
+
+@op(torch.ops.aten.ne)
+def _aten_ne(x, y):
+  return jnp.not_equal(x, y)
+
+
+@op(torch.ops.aten.cumsum)
+def _aten_cumsum(x, y, dtype=None):
+  if dtype:
+    dtype = tensor.t2j_dtype(dtype)
+  res = jnp.cumsum(x, y, dtype)
+  return res
+
+
+@op(torch.ops.aten.native_layer_norm)
+def _aten_native_layer_norm(
+  input, normalized_shape, weight=None, bias=None, eps=1e-5
+):
+  """Implements layer normalization in Jax as defined by `aten::native_layer_norm`.
+
+  Args:
+    input: The input tensor.
+    normalized_shape: A list of integer dimensions to be normalized over.
+    weight: Optional weight tensor for the affine transformation.
+    bias: Optional bias tensor for the affine transformation.
+    eps: A small epsilon value for numerical stability.
+
+  Returns:
+    output: The normalized tensor.
+    mean: The calculated mean tensor.
+    std: The calculated standard deviation tensor.
+  """
+  if isinstance(normalized_shape, int):
+    normalized_shape = [normalized_shape]
+  axis = [i for i, d in enumerate(input.shape) if d in normalized_shape]
+
+  # Calculate mean and standard deviation
+  mean = jnp.mean(input, axis=axis, keepdims=True)
+  var = jnp.var(input, axis=axis, keepdims=True)
+  rstd = jax.lax.rsqrt(var + eps)
+
+  # Normalize the input
+  norm_x = (input - mean) * rstd
+
+  # Apply affine transformation (if provided)
+  if weight is not None:
+    norm_x *= weight
+  if bias is not None:
+    norm_x += bias
+  return norm_x, mean, rstd
+
+
+# - func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+@op(torch.ops.aten.addmm)
+@op(torch.ops.aten.addmv)
+def _aten_addmm(self, mat1, mat2, *, beta=1.0, alpha=1.0):
+  alpha = jnp.array(alpha).astype(mat1.dtype)
+  beta = jnp.array(beta).astype(mat1.dtype)
+  self *= beta
+  self += alpha * jnp.matmul(mat1, mat2)
+  return self
+
+
+@op(torch.ops.aten.addbmm.default)
+def _aten_addbmm(input, batch1, batch2, *, beta=1, alpha=1):
+  alpha = jnp.array(alpha).astype(batch1.dtype)
+  beta = jnp.array(beta).astype(batch1.dtype)
+  mm = jnp.einsum("bxy, byz -> xz", batch1, batch2)
+  return jax.lax.cond(
+    beta == 0, lambda: alpha * mm, lambda: beta * input + alpha * mm
+  )
+
+
+@op(torch.ops.aten.gelu)
+def _aten_gelu(self, *, approximate="none"):
+  approx = approximate == "tanh"
+  return jax.nn.gelu(self, approx)
+
+
+@op(torch.ops.aten.squeeze)
+@op(torch.ops.aten.squeeze_copy)
+def _aten_squeeze_dim(self, dim):
+  """Squeezes a Jax tensor by removing a single dimension of size 1.
+
+  Args:
+    self: The input tensor.
+    dim: The dimension to squeeze.
+
+  Returns:
+    The squeezed tensor with the specified dimension removed if it is 1,
+    otherwise the original tensor is returned.
+  """
+
+  # Validate input arguments
+  if not isinstance(self, jnp.ndarray):
+    raise TypeError(f"Expected a Jax tensor, got {type(self)}.")
+  if isinstance(dim, int):
+    dim = [dim]
+
+  # Check if the specified dimension has size 1
+  if all([self.shape[d] != 1 for d in dim]):
+    return self
+
+  # Use slicing to remove the dimension if it is 1
+  new_shape = list(self.shape)
+
+  def fix_dim(p):
+    if p < 0:
+      return p + len(self.shape)
+    return p
+
+  dim = [fix_dim(d) for d in dim]
+  new_shape = [p for i, p in enumerate(self.shape) if i not in dim or p != 1]
+  return self.reshape(new_shape)
+
+
+@op(torch.ops.aten.convolution)
+def _aten_convolution(
+  input,
+  weight,
+  bias,
+  stride,
+  padding,
+  dilation,
+  transposed,
+  output_padding,
+  groups,
+):
+  if transposed:
+    raise NotImplementedError("Transposed convolution is not implemented.")
+
+  def make_padding(padding):
+    return ((p, p) for p in padding)
+
+  def create_default_conv_dimension_numbers(num_spatial_dims):
+    # Ref: https://github.com/openxla/xla/blob/main/xla/client/xla_builder.cc#L4211
+    # (batch dimension, feature dimension, spatial dimensions...)
+    lhs_spec = [0, 1]
+    # (out feature dimension, in feature dimension, spatial dimensions...)
+    rhs_spec = [0, 1]
+    # (batch dimension, feature dimension, spatial dimensions...)
+    out_spec = [0, 1]
+    for i in range(0, num_spatial_dims):
+      lhs_spec.append(i + 2)
+      rhs_spec.append(i + 2)
+      out_spec.append(i + 2)
+    return jax.lax.ConvDimensionNumbers(
+      *map(tuple, (lhs_spec, rhs_spec, out_spec))
+    )
+
+  res = jax.lax.conv_general_dilated(
+    input,
+    weight,
+    stride,
+    make_padding(padding),
+    lhs_dilation=(1,) * len(stride),
+    rhs_dilation=dilation,
+    dimension_numbers=create_default_conv_dimension_numbers(len(stride)),
+    feature_group_count=groups,
+    batch_group_count=1,
+  )
+
+  if bias is not None:
+    # TODO(qihqi): bias always on channel?
+    if len(bias.shape) == 1:
+      shape = [1] * len(res.shape)
+      shape[1] = bias.shape[0]
+      bias = bias.reshape(tuple(shape))
+    res = res + bias
+  return res
+
+
+# _native_batch_norm_legit(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps)
+@op(torch.ops.aten._native_batch_norm_legit)
+def _aten__native_batch_norm_legit(
+  input, weight, bias, running_mean, running_var, training, momentum, eps
+):
+  """JAX implementation of batch normalization with optional parameters.
+  Refers to https://github.com/pytorch/pytorch/blob/cd3a71f754a2248bcfe500de7c9860bd7d2002bf/torch/_decomp/decompositions.py#L1713.
+
+  Args:
+    input (DeviceArray): Input data (N, C, H, W).
+    running_mean ([DeviceArray]): Running mean of input (C,).
+    running_var ([DeviceArray]): Running variance of input (C,).
+    weight (Optional[DeviceArray]): Scaling factor (gamma) (C,). Can be None.
+    bias (Optional[DeviceArray]): Shift factor (beta) (C,). Can be None.
+    training (bool): If True, use batch statistics for normalization. 
+                     If False, use running statistics.
+    momentum (float): Momentum factor for updating running statistics.
+    eps (float): Small constant for numerical stability.
+
+  Returns:
+    DeviceArray: Normalized output
+    DeviceArray: Batch mean (C,) or empty if training is False
+    DeviceArray: Reversed batch variance (C,) or empty if training is False
+  """
+  reduction_dims = [0] + list(range(2, input.ndim))
+  reshape_dims = [1, -1] + [1]*(input.ndim-2)
+
+  if training:
+    # Calculate batch mean and variance
+    mean = jnp.mean(input, axis=reduction_dims, keepdims=True)
+    saved_mean = jnp.squeeze(mean, reduction_dims)
+    var = jnp.var(input, axis=reduction_dims) 
+    rstd = jax.lax.rsqrt(var.reshape(reshape_dims) + eps)
+    # Update running statistics using momentum
+    running_mean = (1 - momentum) * running_mean + momentum * saved_mean
+    running_var = (1 - momentum) * running_var + momentum * var
+    saved_rstd = jnp.squeeze(rstd, reduction_dims)
+  else:
+    rstd = jax.lax.rsqrt(running_var.reshape(reshape_dims) + eps) 
+    saved_mean = jnp.array([])  # No need to calculate batch statistics in inference mode
+    saved_rstd = jnp.array([])
+
+  # Normalize
+  if training:
+    # use batch statistics if training
+    x_hat = (input - mean) * rstd 
+  else:
+    # Use running statistics in inference mode
+    x_hat = (input - running_mean.reshape(reshape_dims)) * rstd
+
+  # Scale and shift
+  if weight is not None:
+    x_hat *= weight.reshape(reshape_dims)  # Reshape weight for broadcasting
+  if bias is not None:
+    x_hat += bias.reshape(reshape_dims)    # Reshape bias for broadcasting
+
+  return x_hat, saved_mean, saved_rstd 
+
+
+
+@op(torch.ops.aten._native_batch_norm_legit_no_training)
+def _aten__native_batch_norm_legit_no_training(
+  input, weight, bias, running_mean, running_var, momentum, eps
+):
+    return _aten__native_batch_norm_legit(
+    input, weight, bias, running_mean, running_var, False, momentum, eps
+  )
+
+
+@op(torch.ops.aten.relu)
+def _aten_relu(self):
+  return jax.nn.relu(self)
+
+
+@op(torch.ops.aten.cat)
+def _aten_cat(tensors, dims=0):
+  return jnp.concatenate(tensors, dims)
+
+
+@op(torch.ops.aten.max_pool2d_with_indices)
+@op(torch.ops.aten.max_pool3d_with_indices)
+def _aten_max_pool2d_with_indices(
+  inputs, kernel_size, strides, padding=0, dilation=1, ceil_mode=False
+):
+  num_batch_dims = len(inputs.shape) - len(kernel_size) - 1
+  kernel_size = tuple(kernel_size)
+  strides = tuple(strides)
+  if isinstance(padding, int):
+    padding = tuple((padding, padding) for _ in range(len(kernel_size)))
+  elif isinstance(padding, list):
+    padding = tuple((p, p) for p in padding)
+
+  window_shape = kernel_size
+  num_batch_dims = inputs.ndim - (len(window_shape) + 1)
+  strides = strides or (1,) * len(window_shape)
+  assert len(window_shape) == len(
+    strides
+  ), f"len({window_shape}) must equal len({strides})"
+  strides = (1,) * (1 + num_batch_dims) + strides
+  dims = (1,) * (1 + num_batch_dims) + window_shape
+
+  is_single_input = False
+  if num_batch_dims == 0:
+    # add singleton batch dimension because lax.reduce_window always
+    # needs a batch dimension.
+    inputs = inputs[None]
+    strides = (1,) + strides
+    dims = (1,) + dims
+    is_single_input = True
+
+  assert inputs.ndim == len(dims), f"len({inputs.shape}) != len({dims})"
+  if not isinstance(padding, str):
+    padding = tuple(map(tuple, padding))
+    assert len(padding) == len(window_shape), (
+      f"padding {padding} must specify pads for same number of dims as "
+      f"window_shape {window_shape}"
+    )
+    assert all(
+      [len(x) == 2 for x in padding]
+    ), f"each entry in padding {padding} must be length 2"
+    padding = ((0, 0), (0, 0)) + padding
+
+  indices = jnp.arange(np.prod(inputs.shape)).reshape(inputs.shape)
+
+  def reduce_fn(a, b):
+    ai, av = a
+    bi, bv = b
+    which = av > bv
+    return jnp.where(which, ai, bi), jnp.where(which, av, bv)
+
+  init_val = -jnp.inf
+  if inputs.dtype in (jnp.int32, jnp.int64):
+    init_val = -(1 << 31)
+  init_val = jnp.array(init_val).astype(inputs.dtype)
+
+  indices, y = jax.lax.reduce_window(
+    (indices, inputs), (0, init_val), reduce_fn, dims, strides, padding
+  )
+  if is_single_input:
+    indices = jnp.squeeze(indices, axis=0)
+    y = jnp.squeeze(y, axis=0)
+  return y, indices
+
+  batch_result = pool(
+    inputs, -jnp.inf, jax.lax.max, kernel_size, strides, padding
+  )
+  indices = pool(inputs, 0, jnp.argmax, kernel_size, strides, padding)
+  return batch_result, indices
+
+
+# TODO add more ops
+
+
+@op(torch.ops.aten.min)
+def _aten_min(x, axis=None):
+  return jnp.min(x, axis=axis), jnp.argmin(x, axis=axis).astype(jnp.int64)
+
+
+@op(torch.ops.aten.amin)
+def _aten_amin(x, dim=None, keepdim=False):
+  return _with_reduction_scalar(jnp.amin, x, dim, keepdim)
+
+
+@op(torch.ops.aten.argmin)
+def _aten_argmin(self, dim=None, keepdim=False):
+  return _with_reduction_scalar(jnp.argmin, self, dim, keepdim)
+
+
+@op(torch.ops.aten.sin)
+def _aten_sin(x):
+  return jnp.sin(x)
+
+
+@op(torch.ops.aten.sym_size)
+def _aten_sym_size(x, dim):
+  return x.shape[dim]
+
+
+@op(torch.ops.aten.var.correction)
+@op(torch.ops.prims.var)
+def _aten_var(x, dim=None, *, correction=1, keepdim=False, out=None):
+  return jnp.var(x, axis=dim, ddof=correction, keepdims=keepdim)
+
+
+@op(torch.ops.prims.broadcast_in_dim)
+def _prims_broadcast_in_dim(t, shape, broadcast_dimensions):
+  return jax.lax.broadcast_in_dim(
+    t, shape, broadcast_dimensions=broadcast_dimensions
+  )
+
+
+# aten.native_group_norm -- should use decomp table
+# func: native_group_norm(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps) -> (Tensor, Tensor, Tensor)
+
+
+@op(torch.ops.aten.native_group_norm)
+def _aten_native_group_norm(input, weight, bias, N, C, HxW, group, eps=1e-5):
+  """Group Normalization implementation in JAX.
+
+  Args:
+    input: Input tensor. Expected shape (batch_size, channels, ... spatial dims
+      ...)
+    weight: Optional scaling (gamma) parameter. Shape (channels,)
+    bias: Optional shifting (beta) parameter. Shape (channels,)
+    N: Batch size.
+    C: Number of channels.
+    HxW: Product of spatial dimensions (number of elements per channel after
+      flattening).
+    group: Number of groups for Group Normalization.
+    eps: Small value added for numerical stability.
+
+  Returns:
+    A tuple of (normalized_output, mean, rstd)
+  """
+
+  input_shape = input.shape
+
+  # Reshape for group-wise normalization
+  reshaped_input = jnp.reshape(input, (1, N * group, -1))
+
+  # **Core Group Normalization**
+  def group_norm_body(x):  # Function to apply within each group
+    mean = jnp.mean(x, axis=-1, keepdims=True)
+    var = jnp.var(x, axis=-1, keepdims=True)
+    rstd = jax.lax.rsqrt(var + eps)  # Reciprocal of std with epsilon
+    normalized = (x - mean) * rstd
+    return normalized, mean, rstd
+
+  normalized, group_mean, group_rstd = jax.lax.map(
+    group_norm_body, reshaped_input
+  )
+
+  # Reshape back to original input shape
+  output = jnp.reshape(normalized, input_shape)
+
+  # **Affine transformation**
+  affine_shape = [
+    -1 if i == 1 else 1 for i in range(input.ndim)
+  ]  # Shape for broadcasting
+  if weight is not None and bias is not None:
+    output = bias.reshape(affine_shape) + output * weight.reshape(affine_shape)
+  elif weight is not None:
+    output = output * weight.reshape(affine_shape)
+  elif bias is not None:
+    output = output + bias.reshape(affine_shape)
+
+  # Reshape mean and rstd
+  mean = jnp.reshape(group_mean, (N, group))
+  rstd = jnp.reshape(group_rstd, (N, group))
+
+  return output, mean, rstd
+
+
+@op(torch.ops.aten.linalg_vector_norm)
+def _aten_linalg_vector_norm(self, ord=2, dim=None, keepdim=False, dtype=None):
+  """Calculates the vector norm along specified dimensions.
+
+  Args:
+      self: The input tensor.
+      ord: The order of the norm. Can be a float or 'inf', '-inf', 'fro'.
+        Default is 2 (Euclidean norm).
+      dim: Dimensions along which to calculate the norm. If None, the norm is
+        calculated over all dimensions.
+      keepdim: Whether to keep the reduced dimensions.
+      dtype: Optional data type for the output.
+
+  Returns:
+      The tensor containing the calculated vector norms.
+  """
+
+  if ord not in {2, float("inf"), float("-inf"), "fro"}:
+    raise ValueError(
+      f"Unsupported ord value: {ord}. Supported values are 2, inf, -inf, and"
+      " 'fro'."
+    )
+
+  # Special cases (for efficiency and clarity)
+  if ord == 2:  # Euclidean norm
+    result = jnp.sqrt(jnp.sum(jnp.abs(self) ** 2, axis=dim, keepdims=keepdim))
+
+  elif ord == float("inf"):
+    result = jnp.max(jnp.abs(self), axis=dim, keepdims=keepdim)
+
+  elif ord == float("-inf"):
+    result = jnp.min(jnp.abs(self), axis=dim, keepdims=keepdim)
+
+  elif ord == "fro":  # Frobenius norm
+    result = jnp.sqrt(jnp.sum(jnp.abs(self) ** 2, axis=dim, keepdims=keepdim))
+
+  else:  # General case (e.g., ord = 1, ord = 3)
+    result = jnp.sum(jnp.abs(self) ** ord, axis=dim, keepdims=keepdim) ** (
+      1.0 / ord
+    )
+
+  # (Optional) dtype conversion
+  if dtype is not None:
+    result = result.astype(dtype)
+
+  return result
+
+
+# aten.reflection_pad1d
+@op(torch.ops.aten.reflection_pad1d)
+def _aten_reflection_pad1d(input, padding):
+  rank = len(input.shape)
+  pad_size = [(0, 0)] * rank
+  pad_size[-1] = padding
+  return jnp.pad(input, pad_size, mode="reflect")
+
+
+# aten.alias
+@op(torch.ops.aten.alias)
+def _aten_alias(self, *args):
+  return self
+
+
+# aten.sinh
+@op(torch.ops.aten.sinh)
+def _aten_sinh(self):
+  return jnp.sinh(self)
+
+
+# aten.native_layer_norm_backward
+@op(torch.ops.aten.native_layer_norm_backward)
+def _aten_native_layer_norm_backward(
+  grad_out, input, normalized_shape, weight, bias, eps=1e-5
+):
+  """Implements the backward pass of layer normalization in Jax as defined by `aten::native_layer_norm_backward`.
+
+  Args:
+    grad_out: The gradient of the output tensor.
+    input: The input tensor.
+    normalized_shape: A list of integer dimensions to be normalized over.
+    weight: Optional weight tensor for the affine transformation.
+    bias: Optional bias tensor for the affine transformation.
+    eps: A small epsilon value for numerical stability.
+
+  Returns:
+    A tuple of (grad_input, grad_weight, grad_bias).
+  """
+  return jax.lax.native_layer_norm_backward(
+    grad_out, input, normalized_shape, weight, bias, eps
+  )
+
+
+# aten.reflection_pad3d_backward
+# aten.reflection_pad2d
+
+
+# aten.atanh
+@op(torch.ops.aten.atanh)
+def _aten_atanh(self):
+  return jnp.arctanh(self)
+
+
+# aten.bitwise_not
+@op(torch.ops.aten.bitwise_not)
+def _aten_bitwise_not(self):
+  return ~self
+
+
+# aten.embedding_dense_backward
+
+
+# aten.sum
+@op(torch.ops.aten.sum)
+def _aten_sum(self, dim=None, keepdim=False, dtype=None):
+  if not dim:
+    dim = None
+  return jnp.sum(self, axis=dim, keepdims=keepdim, dtype=dtype)
+
+
+# aten.sqrt
+@op(torch.ops.aten.sqrt)
+def _aten_sqrt(self):
+  return jnp.sqrt(self)
+
+
+@op(torch.ops.aten.tan)
+def _aten_tanh(self):
+  return jnp.tan(self)
+
+
+# aten.tanh
+@op(torch.ops.aten.tanh)
+def _aten_tanh(self):
+  return jnp.tanh(self)
+
+
+# aten.ceil
+@op(torch.ops.aten.ceil)
+def _aten_ceil(self):
+  return jnp.ceil(self)
+
+
+# aten.asin
+@op(torch.ops.aten.asin)
+def _aten_asin(self):
+  return jnp.arcsin(self)
+
+
+# aten.minimum
+@op(torch.ops.aten.minimum)
+def _aten_minimum(self, other):
+  return jnp.minimum(self, other)
+
+
+# aten.max_pool2d_backward
+
+
+def _scatter_index(dim, index):
+  """Returns a tuple of indexes;
+
+  The first is to select in input (to modify),
+  the second is to select from the values.
+  """
+  index_shape = list(index.shape)
+  input_indexes = []
+  source_indexes = []
+  for i in range(len(index_shape)):
+    source_indexes.append(slice(0, index_shape[i]))
+    if i == dim:
+      input_indexes.append(index)
+    else:
+      target_shape = [1] * len(index_shape)
+      target_shape[i] = index_shape[i]
+      input_indexes.append(
+        jnp.broadcast_to(
+          jnp.arange(index_shape[i]).reshape(target_shape), index_shape
+        )
+      )
+  return tuple(input_indexes), tuple(source_indexes)
+
+
+# aten.scatter_add
+@op(torch.ops.aten.scatter_add)
+def _aten_scatter_add(input, dim, index, src):
+  """JAX implementation of scatter, mimicking torch.scatter behavior"""
+
+  input_indexes, source_indexes = _scatter_index(dim, index)
+  return input.at[input_indexes].add(src[source_indexes])
+
+
+# aten.logical_not
+
+
+# aten.sign
+@op(torch.ops.aten.sign)
+def _aten_sign(x):
+  return jnp.sign(x)
+
+
+# aten.sigmoid
+@op(torch.ops.aten.sigmoid)
+def _aten_sigmoid(x):
+  if x.dtype in (jnp.int32, jnp.int64):
+    x = x.astype(jnp.float32)
+  return jax.nn.sigmoid(x)
+
+
+# implement aten.asinh in jax
+@op(torch.ops.aten.asinh)
+def _aten_asinh(self):
+  return jnp.arcsinh(self)
+
+
+# aten.atan
+@op(torch.ops.aten.atan)
+def _aten_atan(self):
+  return jnp.arctan(self)
+
+
+# aten.scatter_reduce
+@op(torch.ops.aten.scatter_reduce)
+def _aten_scatter_reduce(input, dim, index, src, reduce, *, include_self=True):
+  input_indexes, source_indexes = _scatter_index(dim, index)
+  if reduce == "sum":
+    return input.at[input_indexes].add(src[source_indexes])
+  elif reduce == "prod":
+    return input.at[input_indexes].multiply(src[source_indexes])
+  elif reduce == "mean":
+    return input.at[input_indexes].add(src[source_indexes])
+  elif reduce == "amax":
+    return input.at[input_indexes].max(src[source_indexes])
+  elif reduce == "amin":
+    return input.at[input_indexes].min(src[source_indexes])
+  else:
+    raise RuntimeError("Unknow reduction type: ", reduce)
+
+
+# aten.acos
+@op(torch.ops.aten.acos)
+def _aten_acos(self):
+  return jnp.arccos(self)
+
+
+# aten.sym_storage_offset
+# aten.native_layer_norm_backward
+# aten.max_pool3d_with_indices
+
+
+# aten.gt
+@op(torch.ops.aten.gt)
+def _aten_gt(self, other):
+  return self > other
+
+
+# aten.pixel_shuffle
+@op(torch.ops.aten.pixel_shuffle)
+def _aten_pixel_shuffle(x, upscale_factor):
+  """PixelShuffle implementation in JAX.
+
+  Args:
+    x: Input tensor. Typically a feature map.
+    upscale_factor: Integer by which to upscale the spatial dimensions.
+
+  Returns:
+    Tensor after PixelShuffle operation.
+  """
+
+  batch_size, channels, height, width = x.shape
+
+  if channels % (upscale_factor**2) != 0:
+    raise ValueError(
+      "Number of channels must be divisible by the square of the upscale factor."
+    )
+
+  new_channels = channels // (upscale_factor**2)
+  new_height = height * upscale_factor
+  new_width = width * upscale_factor
+
+  x = x.reshape(
+    batch_size, new_channels, upscale_factor, upscale_factor, height, width
+  )
+  x = jnp.transpose(
+    x, (0, 1, 2, 4, 3, 5)
+  )  # Move channels to spatial dimensions
+  x = x.reshape(batch_size, new_channels, new_height, new_width)
+
+  return x
+
+
+# aten.sym_stride
+# aten.lt
+@op(torch.ops.aten.lt)
+def _aten_lt(self, other):
+  return self < other
+
+
+def pool(inputs, init, reduce_fn, window_shape, strides, padding):
+  """Helper function to define pooling functions.
+
+  Pooling functions are implemented using the ReduceWindow XLA op.
+  NOTE: Be aware that pooling is not generally differentiable.
+  That means providing a reduce_fn that is differentiable does not imply that
+  pool is differentiable.
+
+  Args:
+    inputs: input data with dimensions (batch, window dims..., features).
+    init: the initial value for the reduction
+    reduce_fn: a reduce function of the form ``(T, T) -> T``.
+    window_shape: a shape tuple defining the window to reduce over.
+    strides: a sequence of ``n`` integers, representing the inter-window
+      strides (default: ``(1, ..., 1)``).
+    padding: either the string ``'SAME'``, the string ``'VALID'``, or a sequence
+      of ``n`` ``(low, high)`` integer pairs that give the padding to apply before
+      and after each spatial dimension.
+  Returns:
+    The output of the reduction for each window slice.
+  """
+  num_batch_dims = inputs.ndim - (len(window_shape) + 1)
+  strides = strides or (1,) * len(window_shape)
+  assert len(window_shape) == len(
+    strides
+  ), f"len({window_shape}) must equal len({strides})"
+  strides = (1,) * (1 + num_batch_dims) + strides
+  dims = (1,) * (1 + num_batch_dims) + window_shape
+
+  is_single_input = False
+  if num_batch_dims == 0:
+    # add singleton batch dimension because lax.reduce_window always
+    # needs a batch dimension.
+    inputs = inputs[None]
+    strides = (1,) + strides
+    dims = (1,) + dims
+    is_single_input = True
+
+  assert inputs.ndim == len(dims), f"len({inputs.shape}) != len({dims})"
+  if not isinstance(padding, str):
+    padding = tuple(map(tuple, padding))
+    assert len(padding) == len(window_shape), (
+      f"padding {padding} must specify pads for same number of dims as "
+      f"window_shape {window_shape}"
+    )
+    assert all(
+      [len(x) == 2 for x in padding]
+    ), f"each entry in padding {padding} must be length 2"
+    padding = ((0, 0), (0, 0)) + padding
+  y = jax.lax.reduce_window(inputs, init, reduce_fn, dims, strides, padding)
+  if is_single_input:
+    y = jnp.squeeze(y, axis=0)
+  return y
+
+
+@op(torch.ops.aten._adaptive_avg_pool3d)
+def _aten_adaptive_avg_pool3d(x, output_shape):
+  return _aten_adaptive_avg_pool(x, output_shape, 3)
+
+
+@op(torch.ops.aten._adaptive_avg_pool2d)
+def _aten_adaptive_avg_pool3d(x, output_shape):
+  return _aten_adaptive_avg_pool(x, output_shape, 2)
+
+
+def _aten_adaptive_avg_pool(x, output_shape, pool_dim):
+  def adaptive_kernel_size(input_shape, output_shape):
+    sizes = [1, 1]
+    spatial_dim_off = len(input_shape) - pool_dim
+    for spatial_dim in range(pool_dim):
+      sizes.append(
+        input_shape[spatial_dim_off + spatial_dim] // output_shape[spatial_dim]
+      )
+    return tuple(sizes)
+
+  kernel_sizes = adaptive_kernel_size(x.shape, output_shape)
+  y = pool(x, 0.0, jax.lax.add, kernel_sizes, kernel_sizes, padding="VALID")
+
+  div_shape = list(x.shape)
+  num_batch_dims = len(x.shape) - pool_dim - 1
+  div_shape[num_batch_dims] = 1
+  div_shape = tuple(div_shape)
+  if len(div_shape) - 2 == len(kernel_sizes):
+    div_shape = (1,) + div_shape[1:]
+  y = y / pool(
+    jnp.ones(div_shape), 0.0, jax.lax.add, kernel_sizes, kernel_sizes, "VALID"
+  )
+  return y
+
+
+# aten.avg_pool2d
+@op(torch.ops.aten.avg_pool2d)
+@op(torch.ops.aten.avg_pool3d)
+def _aten_avg_pool(
+  inputs,
+  kernel_size,
+  strides=None,
+  padding=0,
+  ceil_mode=False,
+  count_include_pad=True,
+  divisor_override=None,
+):
+  num_batch_dims = len(inputs.shape) - len(kernel_size) - 1
+  kernel_size = tuple(kernel_size)
+  strides = tuple(strides)
+  if isinstance(padding, int):
+    padding = tuple((padding, padding) for _ in range(len(kernel_size)))
+  elif isinstance(padding, list):
+    padding = tuple((p, p) for p in padding)
+
+  y = pool(inputs, 0.0, jax.lax.add, kernel_size, strides, padding)
+  if count_include_pad:
+    y = y / np.prod(kernel_size)
+  else:
+    div_shape = list(inputs.shape)
+    div_shape[num_batch_dims] = 1
+    div_shape = tuple(div_shape)
+    if len(div_shape) - 2 == len(kernel_size):
+      div_shape = (1,) + div_shape[1:]
+    y = y / pool(
+      jnp.ones(div_shape), 0.0, jax.lax.add, kernel_size, strides, padding
+    )
+  return y
+
+
+# aten.sym_numel
+# aten.reciprocal
+@op(torch.ops.aten.reciprocal)
+def _aten_reciprocal(a):
+  return 1 / a
+
+
+# aten.scatter
+@op(torch.ops.aten.select_scatter)
+def _aten_select_scatter(input, src, dim, index):
+  input_indexes = []
+  for x in range(len(input.shape)):
+    if x == dim:
+      input_indexes.append(index)
+    else:
+      input_indexes.append(slice(None, None, None))
+  return input.at[tuple(input_indexes)].set(src)
+
+
+@op(torch.ops.aten.scatter.src)
+def _aten_scatter_src(input, dim, index, src, reduce=None):
+  input_index, source_indexes = _scatter_index(dim, index)
+  return input.at[input_index].set(src[source_indexes])
+
+
+@op(torch.ops.aten.scatter.value)
+def _aten_scatter(input, dim, index, src, reduce=None):
+  input_index, source_indexes = _scatter_index(dim, index)
+  return input.at[input_index].set(src)
+
+
+# aten.acosh
+@op(torch.ops.aten.acosh)
+def _aten_acosh(self):
+  return jnp.arccosh(self)
+
+
+# aten.avg_pool2d_backward
+# aten.col2im
+# aten.avg_pool3d
+# aten.round
+@op(torch.ops.aten.round)
+def _aten_round(input, decimals=0):
+  return jnp.round(input, decimals)
+
+
+# aten.max
+@op(torch.ops.aten.max)
+def _aten_max(self, dim=None, keepdim=False):
+  return jnp.max(self, axis=dim, keepdims=keepdim), jnp.argmax(
+    self, axis=dim, keepdims=keepdim
+  )
+
+
+# aten.maximum
+@op(torch.ops.aten.maximum)
+def _aten_maximum(self, other):
+  return jnp.maximum(self, other)
+
+
+# aten.abs
+@op(torch.ops.aten.abs)
+def _aten_abs(self):
+  return jnp.abs(self)
+
+
+# generate aten.amax only
+@op(torch.ops.aten.amax)
+def _aten_amax(self, dim=None, keepdim=False):
+  return _with_reduction_scalar(jnp.amax, self, dim, keepdim)
+
+
+def _with_reduction_scalar(jax_func, self, dim, keepdim):
+  expanded = False
+  if self.ndim == 0:
+    # for self of rank 0:
+    # torch.any(x, 0), torch.any(x, -1) works;
+    # torch.any(x, 1) throws out of bounds, so it's
+    # behavior is the same as a jnp array of rank 1
+    expanded = True
+    self = jnp.expand_dims(self, 0)
+  res = jax_func(self, axis=dim, keepdims=keepdim)
+  if expanded:
+    res = res.squeeze()
+  return res
+
+
+# aten.any
+@op(torch.ops.aten.any)
+def _aten_any(self, dim=None, keepdim=False):
+  return _with_reduction_scalar(jnp.any, self, dim, keepdim)
+
+
+# aten.arange
+@op(torch.ops.aten.arange.start_step)
+@op(torch.ops.aten.arange.start)
+@op(torch.ops.aten.arange.default)
+def _aten_arange(
+  start,
+  end=None,
+  step=1,
+  *,
+  dtype=None,
+  layout=None,
+  requires_grad=False,
+  device=None,
+  pin_memory=False,
+):
+  if end is None:
+    end = start
+    start = 0
+  if dtype:
+    dtype = tensor.t2j_dtype(dtype)
+  return jnp.arange(
+    start,
+    end,
+    step,
+    dtype=dtype,
+  )
+
+
+# aten.argmax
+@op(torch.ops.aten.argmax)
+def _aten_argmax(self, dim=None, keepdim=False):
+  return _with_reduction_scalar(jnp.argmax, self, dim, keepdim)
+
+
+# aten.as_strided
+@op(torch.ops.aten.as_strided)
+@op(torch.ops.aten.as_strided_copy)
+def _aten_as_strided(x, sizes, strides, storage_offset=None):
+  ind = jnp.zeros(sizes, dtype=jnp.int32)
+
+  for i, (size, stride) in enumerate(zip(sizes, strides)):
+    result_shape = (1,) * i + (size,) + (1,) * (len(sizes) - i - 1)
+    indexes = (jnp.arange(size) * stride).reshape(result_shape)
+    ind += indexes
+
+  return jnp.ravel(x)[ind]
+
+
+# aten.atan2
+@op(torch.ops.aten.atan2)
+def _aten_atan2(self, other):
+  return jnp.arctan2(self, other)
+
+
+# aten.bitwise_and
+@op(torch.ops.aten.bitwise_and)
+def _aten_bitwise_and(self, other):
+  return self & other
+
+
+# aten.bitwise_or
+@op(torch.ops.aten.bitwise_or)
+def _aten_bitwise_or(self, other):
+  return self | other
+
+
+# aten.bitwise_xor
+@op(torch.ops.aten.bitwise_xor)
+def _aten_bitwise_xor(self, other):
+  return self ^ other
+
+
+# aten.clamp
+@op(torch.ops.aten.clamp.default)
+@op(torch.ops.aten.clamp.Tensor)
+def _aten_clamp(self, min=None, max=None):
+  return jnp.clip(self, min, max)
+
+
+# aten.constant_pad_nd
+@op(torch.ops.aten.constant_pad_nd)
+def _aten_constant_pad_nd(input, padding, value=0):
+  # NOTE: Torch padding is flat and reversed: (1, 1, 2, 2)
+  #  means last dim get padded 1 in front and 1 in back;
+  #  and second last dim get padded 2 in front and 2 in back.
+  # Jax padding tuple of 2-tuple: the same padding is
+  # [(0, 0), ..., (2,2), (1,1)]
+  m = len(padding)
+  rev_padding = [(padding[i - 1], padding[i]) for i in range(m - 1, 0, -2)]
+  pad_dim = tuple(([(0, 0)] * (len(input.shape) - m // 2)) + rev_padding)
+  return jnp.pad(input, pad_dim, mode="constant", constant_values=value)
+
+
+# aten.convolution_backward
+@op(torch.ops.aten.copy)
+@op(torch.ops.aten.lift_fresh_copy)
+def _aten_copy(x):
+  return jnp.copy(x)
+
+
+@op(torch.ops.aten._cdist_forward)
+def _aten_cdist_forward(x1, x2, p, compute_mode=""):
+  # x1 is B x P x M
+  # x2 is B x Q x M
+  # res is B x P x Q
+  x1 = jnp.expand_dims(x1, len(x1.shape) - 1)
+  x2 = jnp.expand_dims(x2, len(x2.shape) - 2)
+  return jnp.linalg.norm(x1 - x2, ord=p, axis=-1)
+
+
+@op(torch.ops.aten._pdist_forward)
+def _aten__pdist_forward(x, p):
+  pairwise_dists = _aten_cdist_forward(x, x, p)
+  condensed_dists = pairwise_dists[
+    jnp.triu_indices(pairwise_dists.shape[0], k=1)
+  ]
+  return condensed_dists
+
+
+# aten.cos
+@op(torch.ops.aten.cos)
+def _aten_cos(input):
+  return jnp.cos(input)
+
+
+# aten.cosh
+@op(torch.ops.aten.cosh)
+def _aten_cosh(input):
+  return jnp.cosh(input)
+
+
+# aten.diagonal
+@op(torch.ops.aten.diagonal)
+def _aten_diagonal(input, offset=0, dim1=0, dim2=1):
+  return jnp.diagonal(input, offset, dim1, dim2)
+
+
+# aten.empty_strided
+# aten.eq
+@op(torch.ops.aten.eq)
+def _aten_eq(input1, input2):
+  return input1 == input2
+
+
+# aten.erf
+@op(torch.ops.aten.erf)
+def _aten_erf(x):
+  if x.dtype in (jnp.int32, jnp.int64):
+    x = x.astype(jnp.float32)
+  return jax.lax.erf(x)
+
+
+# aten.exp
+@op(torch.ops.aten.exp)
+def _aten_exp(input):
+  return jnp.exp(input)
+
+
+# aten.expm1
+@op(torch.ops.aten.expm1)
+def _aten_expm1(input):
+  return jnp.expm1(input)
+
+
+# aten.fill
+@op(torch.ops.aten.fill)
+@op(torch.ops.aten.full_like)
+def _aten_fill(x, value, dtype=None, pin_memory=None, memory_format=None):
+  if dtype is None:
+    dtype = x.dtype
+  else:
+    dtype = tensor.t2j_dtype(dtype)
+  return jnp.full(x.shape, value, dtype)
+
+
+# aten.flip
+@op(torch.ops.aten.flip)
+def _aten_flip(input, dims):
+  if dims is not None:
+    return jnp.flip(input, tuple(dims))
+  else:
+    return jnp.flip(input)
+
+
+# aten.floor
+@op(torch.ops.aten.floor)
+def _aten_floor(input):
+  return jnp.floor(input)
+
+
+# aten.fmod
+@op(torch.ops.aten.fmod)
+def _aten_fmod(input, other):
+  return input - other * _aten_div(input, other, "trunc")
+
+
+# aten.gather
+@op(torch.ops.aten.gather)
+def _aten_gather(input, dim, index):
+  input_indexes, source_indexes = _scatter_index(dim, index)
+  return input[input_indexes]
+
+
+# aten.ge
+@op(torch.ops.aten.ge)
+def _aten_ge(self, other):
+  return self >= other
+
+
+@op(torch.ops.aten.glu)
+@op(torch.ops.aten.glu.default)
+def _aten_glu(x, dim=-1):
+  return jax.nn.glu(x, dim)
+
+
+# aten.hardtanh
+@op(torch.ops.aten.hardtanh)
+def _aten_hardtanh(input, min_val=-1.0, max_val=1.0, inplace=False):
+  return jnp.clip(input, min_val, max_val)
+
+
+# aten.isinf
+@op(torch.ops.aten.isinf)
+def _aten_isinf(input):
+  return jnp.isinf(input)
+
+
+# aten.isnan
+@op(torch.ops.aten.isnan)
+def _aten_isnan(input):
+  return jnp.isnan(input)
+
+
+@op(torch.ops.aten.le)
+def _aten_le(self, other):
+  return self <= other
+
+
+# aten.leaky_relu
+@op(torch.ops.aten.leaky_relu)
+def _aten_leaky_relu(x, negative_slope):
+  return jax.nn.leaky_relu(x, negative_slope)
+
+
+# aten.log
+@op(torch.ops.aten.log)
+def _aten_log(x):
+  return jnp.log(x)
+
+
+# aten.log10
+@op(torch.ops.aten.log10)
+def _aten_log10(x):
+  return jnp.log10(x)
+
+
+# aten.log1p
+@op(torch.ops.aten.log1p)
+def _aten_log1p(x):
+  return jnp.log1p(x)
+
+
+# aten.log2
+@op(torch.ops.aten.log2)
+def _aten_log2(x):
+  return jnp.log2(x)
+
+
+# aten.logical_and
+@op(torch.ops.aten.logical_and)
+def _aten_logical_and(self, other):
+  return jnp.logical_and(self, other)
+
+
+# aten.logical_or
+@op(torch.ops.aten.logical_or)
+def _aten_logical_or(self, other):
+  return jnp.logical_or(self, other)
+
+
+# aten.logical_not
+@op(torch.ops.aten.logical_not)
+def _aten_logical_not(self):
+  return jnp.logical_not(self)
+
+
+# aten.log_softmax
+@op(torch.ops.aten._log_softmax)
+def _aten_log_softmax(self, axis=-1, half_to_float=False):
+  return jax.nn.log_softmax(self, axis)
+
+
+# aten.max_pool3d_backward
+# aten.logical_xor
+@op(torch.ops.aten.logical_xor)
+def _aten_logical_xor(self, other):
+  return jnp.logical_xor(self, other)
+
+
+# aten.max_pool2d_with_indices_backward
+# aten.native_dropout
+# aten.native_group_norm_backward
+# aten.neg
+@op(torch.ops.aten.neg)
+def _aten_neg(x):
+  return -1 * x
+
+
+# aten.nonzero
+@op(torch.ops.aten.nonzero)
+def _aten_nonzero(x):
+  index_tuple = jnp.nonzero(x)
+  index_tuple = [jnp.expand_dims(p, -1) for p in index_tuple]
+  return jnp.concatenate(index_tuple, axis=-1)
+
+
+# aten.prod
+
+
+@op(torch.ops.aten.prod)
+def _aten_prod(self, dim=None, keepdim=False):
+  return jnp.prod(self, axis=dim, keepdims=keepdim)
+
+
+# aten.randperm
+
+
+# aten.reflection_pad3d
+
+
+# aten.remainder
+@op(torch.ops.aten.remainder)
+def _aten_remainder(inputs, other):
+  return inputs % other
+
+
+# aten.repeat
+@op(torch.ops.aten.repeat)
+def _aten_repeat(x, reps):
+  return jnp.tile(x, reps)
+
+
+# aten.replication_pad2d
+# aten.replication_pad3d
+# aten.roll
+@op(torch.ops.aten.roll)
+def _aten_roll(input, shifts, dims=None):
+  return jnp.roll(input, shifts, dims)
+
+
+# aten.scalar_tensor
+# aten.slice_scatter
+@op(torch.ops.aten.slice_scatter)
+def _aten_slice_scatter(input, src, dim=0, start=None, end=None, step=1):
+  input_index = []
+  for x in range(len(input.shape)):
+    if x == dim:
+      input_index.append(slice(start, end, step))
+    else:
+      input_index.append(slice(None, None, None))
+  return input.at[tuple(input_index)].set(src)
+
+
+# aten.sort
+# torch.sort(input, dim=-1, descending=False, stable=False, *, out=None)
+@op(torch.ops.aten.sort)
+def _aten_sort(a, dim=-1, descending=False, stable=False):
+  return (
+    jnp.sort(a, axis=dim, stable=stable, descending=descending),
+    jnp.argsort(a, axis=dim, stable=stable, descending=descending),
+  )
+
+
+# aten.sym_size
+
+
+# aten.topk
+@op(torch.ops.aten.topk)
+def _aten_topk(input, k, dim=None, largest=True, sorted=True, *, out=None):
+  """JAX top-k implementation using jax.lax.top_k for improved efficiency.
+
+  Args:
+      input: The input JAX array.
+      k: The number of top elements to return.
+      dim: The dimension along which to find the top-k. If None, operates on the
+        flattened array.
+      largest: If True, returns the largest k elements. Otherwise, smallest k.
+      sorted: If True, returns the elements in sorted order.
+
+  Returns:
+      A tuple (values, indices) containing:
+          - values: The top k values.
+          - indices: The indices of the top k values in the original array.
+  """
+  if dim is None:
+    # last dim is chosen
+    dim = input.ndim - 1
+
+  if dim < 0:
+    dim = dim + input.ndim
+
+  if not largest:
+    input = -input  # Find top-k of negated input if we want the smallest
+
+  transpose_shape = None
+  if dim != -1 and dim != len(input.shape) - 1:
+    transpose_shape = list(range(len(input.shape)))
+    transpose_shape[dim], transpose_shape[-1] = (
+      transpose_shape[-1],
+      transpose_shape[dim],
+    )
+    input = jnp.transpose(input, transpose_shape)
+
+  values, indices = jax.lax.top_k(input, k)
+
+  if sorted:
+    values = jnp.sort(values, descending=True)
+    indices = jnp.take_along_axis(
+      indices, jnp.argsort(values, axis=-1, descending=True), axis=-1
+    )
+
+  if not largest:
+    values = -values  # Negate values back if we found smallest
+
+  if transpose_shape is not None:
+    values = jnp.transpose(values, transpose_shape)
+    indices = jnp.transpose(indices, transpose_shape)
+
+  return values, indices
+
+
+# aten.trunc
+@op(torch.ops.aten.trunc)
+def _aten_trunc(a):
+  return jnp.trunc(a)
+
+
+@op(torch.ops.aten.unbind)
+@op(torch.ops.aten.unbind_copy)
+def _aten_unbind(a, dim=0):
+  return tuple(
+    _aten_squeeze_dim(jax.lax.index_in_dim(a, i, axis=dim), dim)
+    for i in range(a.shape[dim])
+  )
+
+
+# NOTE: skip aten.upsample_nearest2d and aten.upsample_bilinear2d
+# despite those being core aten ops, they also have decompositions.
+# here we are using torch decompositions.
+
+
+# aten.where
+@op(torch.ops.aten.where.self)
+@op(torch.ops.aten.where.ScalarSelf)
+@op(torch.ops.aten.where.ScalarOther)
+def _aten_where(condition, x, y):
+  return jnp.where(condition, x, y)
+
+
+# aten.to.dtype
+# Tensor(a) self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None
+@op(torch.ops.aten.to.dtype)
+def _aten_to_dtype(
+  a, dtype, non_blocking=False, copy=False, memory_format=None
+):
+  if dtype:
+    jaxdtype = tensor.t2j_dtype(dtype)
+  return a.astype(jaxdtype)
+
+
+# aten.to.device
+
+
+# Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False
+@op(torch.ops.aten.var_mean.correction)
+def _aten_var_mean_correction(self, dim=None, correction=None, keepdim=False):
+  return (
+    jnp.var(self, axis=dim, ddof=correction, keepdims=keepdim),
+    jnp.mean(self, dim, keepdims=keepdim),
+  )
+
+
+@op(torch.ops.aten.scalar_tensor)
+def _aten_scalar_tensor(
+  s, dtype=None, layout=None, device=None, pin_memory=None
+):
+  if dtype is not None:
+    dtype = tensor.t2j_dtype(dtype)
+    return jnp.array(s, dtype=dtype)
+  return jnp.array(s)
+
+
+@op(torch.ops.aten.to.device)
+def _aten_to_device(x, device, dtype):
+  return x
+
+
+@op(torch.ops.aten.max_pool2d_with_indices_backward)
+def max_pool2d_with_indices_backward_custom(
+  grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices
+):
+  """
+  Approximates the gradient calculation of PyTorch's max_pool2d_with_indices_backward.
+
+  Args:
+      grad_output: The gradient tensor from the preceding layer.
+      self: The input tensor on which the original max pooling was performed.
+      kernel_size: The size of the pooling window.
+      stride: The stride of the pooling window.
+      padding: The padding applied during max pooling.
+      dilation: The dilation factor for the pooling operation.
+      ceil_mode: Whether to use ceil or floor when calculating output shapes.
+      indices: The indices of the maximum values, as produced by max_pool2d_with_indices.
+
+  Returns:
+      The calculated gradient with respect to the input (grad_input).
+  """
+
+  kH, kW = kernel_size
+  dH, dW = stride
+  padH, padW = padding
+  dilH, dilW = dilation
+
+  # Calculate output shape (may need adjustment based on ceil_mode)
+  out_shape = jnp.array(self.shape)
+  grad_input = jnp.zeros_like(self)
+
+  # Iterate over the flattened input and output tensors
+  for i, idx in enumerate(indices.flatten()):
+    # Calculate input coordinates corresponding to the maximum value
+    out_y, out_x = i // grad_output.shape[3], i % grad_output.shape[3]
+    in_y = out_y * dH - padH + out_y * (dilH - 1)
+    in_x = out_x * dW - padW + out_x * (dilW - 1)
+
+    # Scatter the gradient to the appropriate input locations (handling potential overlaps)
+    for y in range(in_y, in_y + kH):
+      for x in range(in_x, in_x + kW):
+        if 0 <= y < grad_input.shape[2] and 0 <= x < grad_input.shape[3]:
+          grad_input = grad_input.at[y, x].add(grad_output.flatten()[i])
+
+  return grad_input
+
+
+@op(torch.ops.aten._local_scalar_dense)
+def _aten_local_scalar_dense(x):
+  return x.item()
+
+
+@op(torch.ops.aten.tensor_split.sections)
+def _aten_tensor_split(ary, indices_or_sections, axis=0):
+  return jnp.array_split(ary, indices_or_sections, axis)
+
+
+@op(torch.ops.aten.randn, needs_env=True)
+@op_base.convert_dtype()
+def _randn(
+  *size,
+  generator=None,
+  out=None,
+  dtype=None,
+  layout=torch.strided,
+  device=None,
+  requires_grad=False,
+  pin_memory=False,
+  env=None,
+):
+  shape = size
+  if len(shape) == 1 and isinstance(shape[0], (list, tuple)):
+    shape = shape[0]
+  key = env.get_and_rotate_prng_key()
+  res = jax.random.normal(key, shape)
+  if dtype is not None:
+    res = res.astype(dtype)
+  return res
+
+
+@op(torch.ops.aten.rand, needs_env=True)
+@op_base.convert_dtype()
+def _rand(
+  *size,
+  generator=None,
+  out=None,
+  dtype=None,
+  layout=torch.strided,
+  device=None,
+  requires_grad=False,
+  pin_memory=False,
+  env=None,
+):
+  shape = size
+  if len(shape) == 1 and isinstance(shape[0], (list, tuple)):
+    shape = shape[0]
+  key = env.get_and_rotate_prng_key()
+  res = jax.random.uniform(key, shape)
+  if dtype is not None:
+    res = res.astype(dtype)
+  return res
+
+
+@op(torch.ops.aten.scalar_tensor.default)
+def _aten_scalar_tensor(val, **kwargs):
+  p = torch.ops.aten.scalar_tensor(val)
+  return tensor.t2j(p)
+
+
+@op(torch.ops.aten.to.device)
+def _aten_to_device(x, device, dtype):
+  return x
+
+
+@op(torch.ops.aten.max_pool2d_with_indices_backward)
+def max_pool2d_with_indices_backward_custom(
+  grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices
+):
+  """
+  Approximates the gradient calculation of PyTorch's max_pool2d_with_indices_backward.
+
+  Args:
+      grad_output: The gradient tensor from the preceding layer.
+      self: The input tensor on which the original max pooling was performed.
+      kernel_size: The size of the pooling window.
+      stride: The stride of the pooling window.
+      padding: The padding applied during max pooling.
+      dilation: The dilation factor for the pooling operation.
+      ceil_mode: Whether to use ceil or floor when calculating output shapes.
+      indices: The indices of the maximum values, as produced by max_pool2d_with_indices.
+
+  Returns:
+      The calculated gradient with respect to the input (grad_input).
+  """
+
+  kH, kW = kernel_size
+  dH, dW = stride
+  padH, padW = padding
+  dilH, dilW = dilation
+
+  # Calculate output shape (may need adjustment based on ceil_mode)
+  out_shape = jnp.array(self.shape)
+  grad_input = jnp.zeros_like(self)
+
+  # Iterate over the flattened input and output tensors
+  for i, idx in enumerate(indices.flatten()):
+    # Calculate input coordinates corresponding to the maximum value
+    out_y, out_x = i // grad_output.shape[3], i % grad_output.shape[3]
+    in_y = out_y * dH - padH + out_y * (dilH - 1)
+    in_x = out_x * dW - padW + out_x * (dilW - 1)
+
+    # Scatter the gradient to the appropriate input locations (handling potential overlaps)
+    for y in range(in_y, in_y + kH):
+      for x in range(in_x, in_x + kW):
+        if 0 <= y < grad_input.shape[2] and 0 <= x < grad_input.shape[3]:
+          grad_input = grad_input.at[y, x].add(grad_output.flatten()[i])
+
+  return grad_input
+
+
+@op(torch.ops.aten._local_scalar_dense)
+def _aten_local_scalar_dense(x):
+  return x.item()
+
+
+@op(torch.ops.aten.tensor_split.sections)
+def _aten_tensor_split(ary, indices_or_sections, axis=0):
+  return jnp.array_split(ary, indices_or_sections, axis)
+
+
+@op(torch.ops.aten.outer)
+def _aten_outer(a, b):
+  return jnp.outer(a, b)
+
+
+@op(torch.ops.aten.allclose)
+def _aten_allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False):
+  return jnp.allclose(input, other, rtol, atol, equal_nan)
+
+@op(torch.ops.aten.native_batch_norm)
+def _aten_native_batch_norm(input, weight, bias, running_mean, running_var, training=False, momentum=0.1, eps=1e-5):
+  
+  if running_mean is None:
+    running_mean = jnp.zeros(input.shape[1])  # Initialize running mean if None
+  if running_var is None:
+    running_var = jnp.ones(input.shape[1])   # Initialize running variance if None
+  
+  if training:
+    return torch.ops.aten._native_batch_norm_legit(input, weight, bias, running_mean, running_var, training, momentum, eps)
+  else:
+    return torch.ops.aten._native_batch_norm_legit_no_training(input, weight, bias, running_mean, running_var, momentum, eps) 
diff --git a/experimental/torch_xla2/torch_xla2/ops/jtorch.py b/experimental/torch_xla2/torch_xla2/ops/jtorch.py
index e69de29bb2d..6d7003b936e 100644
--- a/experimental/torch_xla2/torch_xla2/ops/jtorch.py
+++ b/experimental/torch_xla2/torch_xla2/ops/jtorch.py
@@ -0,0 +1,88 @@
+"""Tensor constructor overrides"""
+import functools
+from typing import Optional, Sequence
+
+import jax
+import torch
+import jax.numpy as jnp
+from torch_xla2 import tensor
+from torch_xla2.ops.ops_registry import register_torch_function_op
+from torch_xla2.ops import op_base
+
+
+def register_function(torch_func, **kwargs):
+  return functools.partial(register_torch_function_op, torch_func, **kwargs)
+
+
+@register_function(torch.tensor)
+@op_base.convert_dtype(use_default_dtype=False)  # Attempt to infer type from elements
+def _tensor(data, *, dtype=None, **kwargs):
+  python_types_to_torch_types = {
+      bool: jnp.bool,
+      int: jnp.int64,
+      float: jnp.float32,
+      complex: jnp.complex64,
+  }
+  if not dtype:
+    leaves = jax.tree_util.tree_leaves(data)
+    if len(leaves) > 0:
+      dtype = python_types_to_torch_types.get(type(leaves[0]))
+
+  return jnp.array(
+      data, dtype=dtype or tensor.t2j_dtype(torch.get_default_dtype()))
+
+
+@register_function(torch.ones)
+@op_base.convert_dtype()
+def _ones(*size: int, dtype=None, **kwargs):
+  return jnp.ones(size, dtype)
+
+
+@register_function(torch.zeros)
+@op_base.convert_dtype()
+def _zeros(*size: int, dtype=None, **kwargs):
+  return jnp.zeros(size, dtype)
+
+
+@register_function(torch.eye)
+@op_base.convert_dtype()
+def _eye(n: int, m: Optional[int] = None, *, dtype=None, **kwargs):
+  return jnp.eye(n, m, dtype=dtype)
+
+
+@register_function(torch.full)
+@op_base.convert_dtype()
+def _full(size: Sequence[int], fill_value, *, dtype=None, **kwargs):
+  # TODO: handle torch.Size
+  return jnp.full(size, fill_value, dtype=dtype)
+
+
+@register_function(torch.allclose)
+def _aten_allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False):
+  return jnp.allclose(input, other, rtol, atol, equal_nan)
+
+@register_function(torch.angle)
+def _torch_angle(input):
+  return jnp.angle(input)
+
+
+@register_function(torch.argsort)
+def _torch_argsort(input, dim=-1, descending=False, stable=False):
+  expanded = False
+  if input == 0:
+    # for self of rank 0:
+    # torch.any(x, 0), torch.any(x, -1) works;
+    # torch.any(x, 1) throws out of bounds, so it's
+    # behavior is the same as a jnp array of rank 1
+    expanded = True
+    input = jnp.expand_dims(input, 0)
+  res = jnp.argsort(input, axis=dim, descending=descending, 
+                     stable=stable)
+  if expanded:
+    res = res.squeeze()
+  return res
+
+@register_function(torch.einsum)
+def _einsum(equation, *operands):
+  assert isinstance(equation, str), 'Only accept str equation'
+  return jnp.einsum(equation, *operands)
\ No newline at end of file
diff --git a/experimental/torch_xla2/torch_xla2/ops/op_base.py b/experimental/torch_xla2/torch_xla2/ops/op_base.py
index 62df160edc9..8076a11fb09 100644
--- a/experimental/torch_xla2/torch_xla2/ops/op_base.py
+++ b/experimental/torch_xla2/torch_xla2/ops/op_base.py
@@ -1,22 +1,15 @@
+import functools
 import torch
-from torch_xla2 import extra 
+from torch_xla2 import interop, tensor
+from torch_xla2 import types
 
-class JaxOperator:
-    """This is a aten op backed by jax function."""
-
-    def __init__(self, jax_callable):
-        self.jax = jax_callable
-
-    def __call__(self, *args, **kwargs):
-        # args are torch.Tensor
-        res = call_jax(self.jax, args, kwargs)
-        return res
+from typing import Callable, Optional, ParamSpec, Sequence
 
 
 class BinaryOpWithPromotion:
 
-    def __init__(self, jax_callable):
-        self.jax = jax_callable
+    def __init__(self, inner):
+        self.inner = inner 
 
     def _get_dtype(self, obj):
         if isinstance(obj, torch.Tensor):
@@ -31,7 +24,7 @@ def _get_dtype(self, obj):
 
     def __call__(self, *args, **kwargs):
         # args are torch.Tensor
-        res = extra.torch_view(self.jax)(*args, **kwargs)
+        res = interop.torch_view(self.jax)(*args, **kwargs)
 
         dtype = torch.promote_types(
             self._get_dtype(args[0]), 
@@ -41,15 +34,6 @@ def __call__(self, *args, **kwargs):
         return res
 
 
-class TorchLowering:
-
-    def __init__(self, lowering):
-        self.lowering = lowering
-
-    def __call__(self, *args, **kwargs):
-        return self.lowering(*args, **kwargs)
-
-
 class InplaceOp:
 
     def __init__(self, functional_op, position_to_mutate=0):
@@ -58,7 +42,7 @@ def __init__(self, functional_op, position_to_mutate=0):
 
     def __call__(self, *args, **kwargs):
         to_mutate = args[0]
-        to_mutate._elem = self.functional(*args, **kwargs)._elem
+        to_mutate.copy_(self.functional(*args, **kwargs))
         return to_mutate
 
 
@@ -72,4 +56,29 @@ def __call__(self, *args, **kwargs):
 
 
 
+P = ParamSpec('P')
+def convert_dtype(use_default_dtype: bool = True):
+  """Converts `dtype` kwarg of function from torch to JAX.
+
+  Args:
+    use_default_dtype: Whether to use torch default dtype if none is provided.
+
+  Returns:
+    A decorator that wraps a JAX implementation of a torch function.
+  """
+
+  def decorator(func: types.TorchCallable):
+
+    @functools.wraps(func)
+    def wrapper(*args: P.args,
+                dtype: Optional[torch.dtype] = None,
+                **kwargs: P.kwargs):
+      if not dtype and use_default_dtype:
+        dtype = torch.get_default_dtype()
+      jax_dtype = tensor.t2j_dtype(dtype)
+
+      return func(*args, dtype=jax_dtype, **kwargs)
+
+    return wrapper
 
+  return decorator
diff --git a/experimental/torch_xla2/torch_xla2/ops/ops_registry.py b/experimental/torch_xla2/torch_xla2/ops/ops_registry.py
new file mode 100644
index 00000000000..e75d1549456
--- /dev/null
+++ b/experimental/torch_xla2/torch_xla2/ops/ops_registry.py
@@ -0,0 +1,47 @@
+import dataclasses
+from torch_xla2.types import JaxCallable, TorchCallable
+
+from typing import Union, Dict
+
+
+@dataclasses.dataclass
+class Operator:
+    torch_op: TorchCallable
+    func: Union[TorchCallable, JaxCallable]
+    is_jax_function: bool
+    is_user_defined: bool
+    needs_env: bool
+
+
+all_aten_ops: Dict[TorchCallable, Operator] = {}
+all_torch_functions: Dict[TorchCallable, Operator] = {}
+
+
+def register_torch_dispatch_op(
+    aten_op, impl_callable, 
+    is_jax_function=True, 
+    is_user_defined=False,
+    needs_env=False,
+):
+    op = Operator(
+        aten_op, impl_callable, 
+        is_jax_function=is_jax_function,
+        is_user_defined=is_user_defined,
+        needs_env=needs_env)
+    all_aten_ops[aten_op] = op
+    return impl_callable 
+
+
+def register_torch_function_op(
+    torch_func, impl_callable, 
+    is_jax_function=True, 
+    is_user_defined=False,
+    needs_env=False,
+):
+    op = Operator(
+        torch_func, impl_callable, 
+        is_jax_function=is_jax_function,
+        is_user_defined=is_user_defined,
+        needs_env=needs_env)
+    all_torch_functions[torch_func] = op
+    return impl_callable 
\ No newline at end of file
diff --git a/experimental/torch_xla2/torch_xla2/ops_registry.py b/experimental/torch_xla2/torch_xla2/ops_registry.py
deleted file mode 100644
index f1d115864d3..00000000000
--- a/experimental/torch_xla2/torch_xla2/ops_registry.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import torch
-import torch._decomp as decomp
-import torch_xla2.decompositions
-
-class LoweringRegistry:
-
-  def __init__(self):
-    self.registered_ops = {}
-    self.decomps = {}
-
-  def lookup(self, op_or_name):
-    candidate = self._lookup(op_or_name)
-    if candidate is None:
-      if isinstance(op_or_name, torch._ops.OpOverloadPacket):
-        candidate = self._lookup(op_or_name.default)
-      if isinstance(op_or_name, torch._ops.OpOverload):
-        candidate = self._lookup(op_or_name.overloadpacket)
-    return candidate
-
-  def _lookup(self, op):
-    candidate = self.registered_ops.get(op)
-    if candidate is None:
-      candidate = self.decomp.get(op)
-    return candidate
-
-  def register(self, op, lowering):
-    if isinstance(op, torch._ops.OpOverloadPacket):
-      if hasattr(op, 'default'):
-        self.registered_ops[op.default] = lowering
-    self.registered_ops[op] = lowering
-
-
-lowerings = LoweringRegistry()
-EXTRA_DECOMP = decomp.get_decompositions([
-    torch.ops.aten.upsample_nearest2d,
-    torch.ops.aten._native_batch_norm_legit.no_stats,
-    torch.ops.aten._adaptive_avg_pool2d,
-    torch.ops.aten._adaptive_avg_pool3d,
-    torch.ops.aten.grid_sampler_2d,
-    torch.ops.aten.native_dropout,
-    torch.ops.aten.reflection_pad1d,
-    torch.ops.aten.reflection_pad2d,
-    torch.ops.aten.reflection_pad3d,
-    torch.ops.aten.replication_pad1d,
-    torch.ops.aten.replication_pad2d,
-    torch.ops.aten.replication_pad3d,
-])
-CORE_ATEN_DECOMP = decomp.core_aten_decompositions()
-CORE_ATEN_DECOMP.update(EXTRA_DECOMP)
-lowerings.decomp = CORE_ATEN_DECOMP
-
-
-def _all_core_ops():
-  """Yields all core ops."""
-  import torch._ops
-
-  for k, v in torch.ops.aten.__dict__.items():
-    if k.startswith('__'):
-      continue
-    if k.startswith('_'):
-      continue
-    if isinstance(v, torch._ops.OpOverloadPacket):
-      for overload in v.overloads():
-        op = getattr(v, overload)
-        if torch.Tag.core in op.tags:
-          yield v
-          break
-
-
-def print_missing_ops():
-  core_aten = set(_all_core_ops())
-  existing = set(lowerings.registered_ops.keys())
-  for v in core_aten - existing:
-    print(v)
diff --git a/experimental/torch_xla2/torch_xla2/tensor.py b/experimental/torch_xla2/torch_xla2/tensor.py
index 98953a8b04c..460e06f8841 100644
--- a/experimental/torch_xla2/torch_xla2/tensor.py
+++ b/experimental/torch_xla2/torch_xla2/tensor.py
@@ -1,53 +1,16 @@
-import functools
+import contextlib
 import jax
 from jax import dlpack as jaxdl
 import jax.numpy as jnp
 import numpy
 import torch
 import torch.func
-import torch._decomp.decompositions
-from torch_xla2 import ops_registry
 import torch.utils._python_dispatch as torch_dispatch
 import torch.utils._pytree as torch_pytree
 import torch.utils.dlpack as torchdl
-from torch_xla2.ops import jaten
-from torch._subclasses.fake_tensor import FakeTensorMode
 
-fake_mode = FakeTensorMode()
-
-
-class XLADispatchMode(torch_dispatch.TorchDispatchMode):
-
-  def __torch_dispatch__(self, fn, types, args=(), kwargs=None):
-    if fn in constructors:
-      args, kwargs = unwrap((args, kwargs))
-      res = constructors[fn](*args, **kwargs)
-      return wrap(res)
-
-    return fn(*args, **kwargs)
-
-
-def _aten_arange(start,
-                 end,
-                 *,
-                 dtype=None,
-                 layout=None,
-                 requires_grad=False,
-                 device=None,
-                 pin_memory=False):
-  return jnp.arange(start, end, 1)
-
-
-def _aten_scalar_tensor(val, **kwargs):
-  p = torch.ops.aten.scalar_tensor(val)
-  return wrap(t2j(p))
-
-
-constructors = {
-    torch.ops.aten.scalar_tensor.default: _aten_scalar_tensor,
-    torch.ops.aten.arange.default: functools.partial(_aten_arange, 0),
-    torch.ops.aten.arange.start: _aten_arange,
-}
+class OperatorNotFound(Exception):
+  pass
 
 
 def wrap(jaxarray):
@@ -61,7 +24,9 @@ def unwrap(torchtensors):
 def t2j(t):
   if isinstance(t, XLATensor2):
     return t._elem
+  is_bool = False
   if t.dtype == torch.bool:
+    is_bool = True
     t = t.to(torch.int8)
 
   if not t.is_contiguous():
@@ -82,7 +47,7 @@ def t2j(t):
     if t.dtype == torch.bfloat16:
       res = res.astype(jnp.bfloat16)
 
-  if t.dtype == torch.bool:
+  if is_bool:
     res = res.astype(jnp.bool_)
   return res
 
@@ -97,48 +62,59 @@ def j2t(x):
     res = res.to(torch.bool)
   return res
 
+TORCH_DTYPE_TO_JAX = {
+    # NO_MAPPING        : jnp.float0.dtype (signless scalar int),
+    torch.bool          : jnp.bool_.dtype,
+    # NO_MAPPING        : jnp.int4.dtype,
+    torch.int8          : jnp.int8.dtype,
+    torch.int16         : jnp.int16.dtype,
+    torch.int32         : jnp.int32.dtype,
+    torch.int64         : jnp.int64.dtype,
+    torch.long          : jnp.int64.dtype,
+    # NO_MAPPING        : jnp.uint4
+    torch.uint8         : jnp.uint8.dtype,
+    torch.uint16        : jnp.uint16.dtype,
+    torch.uint32        : jnp.uint32.dtype,
+    torch.uint64        : jnp.uint64.dtype,
+    # NO_MAPPING        : jnp.float8_e4m3b11fnuz.dtype,
+    torch.float8_e4m3fn : jnp.float8_e4m3fn.dtype,
+    # NO_MAPPING        : jnp.float8_e4m3fnuz.dtype,
+    torch.float8_e5m2   : jnp.float8_e5m2.dtype,
+    # NO_MAPPING        : jnp.float8_e5m2fnuz.dtype,
+    torch.bfloat16      : jnp.bfloat16.dtype,
+    torch.half          : jnp.float16.dtype,
+    torch.float16       : jnp.float16.dtype,
+    torch.float32       : jnp.float32.dtype,
+    torch.float64       : jnp.float64.dtype,
+    torch.double        : jnp.double.dtype,
+    torch.complex64     : jnp.complex64.dtype,
+    torch.complex128    : jnp.complex128.dtype,
+    None                : None,
+}
+
+JAX_DTYPE_TO_TORCH = {
+  value: key for key, value in TORCH_DTYPE_TO_JAX.items()
+}
+# Add imprecise mappings for some JAX dtypes which don't have torch analogues
+JAX_DTYPE_TO_TORCH[jnp.dtype('int4')] = torch.int8
+JAX_DTYPE_TO_TORCH[jnp.dtype('uint4')] = torch.uint8
 
 def t2j_dtype(dtype):
-  return {
-      torch.float16: jnp.float16,
-      torch.bfloat16: jnp.bfloat16,
-      torch.half: jnp.float16,
-      torch.float32: jnp.float32,
-      torch.double: jnp.double,
-      torch.long: jnp.int64,
-      torch.int32: jnp.int32,
-      torch.int16: jnp.int16,
-      torch.int8: jnp.int8,
-      torch.uint8: jnp.uint8,
-      torch.bool: jnp.bool_,
-      torch.complex64: jnp.complex64,
-      torch.complex128: jnp.complex128,
-  }.get(dtype)
+  if dtype not in TORCH_DTYPE_TO_JAX:
+    raise RuntimeError(f'Attempting to convert unknown type: {dtype} to torch type,')
+  return TORCH_DTYPE_TO_JAX[dtype]
 
 
 def j2t_dtype(dtype):
-  return {
-      jnp.float16: torch.float16,
-      jnp.bfloat16: torch.bfloat16,
-      jnp.double: torch.double,
-      jnp.float32: torch.float32,
-      jnp.float16: torch.half,
-      jnp.int64: torch.long,
-      jnp.int32: torch.int32,
-      jnp.int16: torch.int16,
-      jnp.bool_: torch.bool,
-      jnp.complex64: torch.complex64,
-  }.get(dtype)
-
-
-def move_to_device(t):
-  return XLATensor2(t2j(t))
+  if dtype not in JAX_DTYPE_TO_TORCH:
+    raise RuntimeError(f'Attempting to convert unknown type: {dtype} to torch type,')
+  return JAX_DTYPE_TO_TORCH[dtype]
 
 
 class XLATensor2(torch.Tensor):
 
   @staticmethod
-  def __new__(cls, elem):
+  def __new__(cls, elem, env):
     dtype = j2t_dtype(elem.dtype)
     shape = list(elem.shape)
     for i, s in enumerate(shape):
@@ -154,9 +130,10 @@ def __new__(cls, elem):
         requires_grad=False,
     )
 
-  def __init__(self, elem: jax.Array):
+  def __init__(self, elem: jax.Array, env: 'Environment'):
     super().__init__()
     self._elem = elem
+    self._env = env
 
   def __str__(self):
     return "XLATensor2({} {})".format(str(type(self._elem)), str(self._elem))
@@ -178,7 +155,7 @@ def flatten(self, start_dim=0, end_dim=-1):
     new_shape = (
         self._elem.shape[:start_dim] + (-1,) + self._elem.shape[end_dim:])
     new_elem = jnp.reshape(self._elem, new_shape)
-    return XLATensor2(new_elem)
+    return XLATensor2(new_elem, self._env)
     # return torch.reshape(self, new_shape)
 
   def __setitem__(self, key, val):
@@ -193,32 +170,17 @@ def type_as(self, other):
 
   @classmethod
   def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
-    kwargs = kwargs or {}
-    with jax.named_scope(func.name()):
-
-      if isinstance(func, torch._ops.OpOverloadPacket):
-        return func(*args, **kwargs)
-
-      if func.name() == 'aten::copy_':
-        x, y = args
-        x._elem = y._elem
-        return
-
-      if func.overloadpacket in jaten.all_ops:
-        return jaten.all_ops[func.overloadpacket](*args, **kwargs)
-
-      lowering = ops_registry.lowerings.lookup(func)
+    env = None
+    for arg in torch_pytree.arg_tree_leaves(*args, **kwargs):
+      if isinstance(arg, XLATensor2):
+        env = arg._env
+        break
 
-      if lowering is None:
-        raise RuntimeError("No lowering found for", func.name())
-
-      with XLADispatchMode():
-        res = lowering(*args, **kwargs)
-      debug_accuracy(func, args, kwargs, res)
-      return res
+    with env:
+      return func(*args, **(kwargs or {}))
 
   def detach(self):
-    return XLATensor2(jax.lax.stop_gradient(self.jax()))
+    return XLATensor2(jax.lax.stop_gradient(self.jax()), self._env)
 
   def numpy(self) -> numpy.ndarray:
     import numpy as np
@@ -231,6 +193,20 @@ def jax(self) -> jax.Array:
   def torch(self) -> torch.Tensor:
     return j2t(self.jax())
 
+  def to(self, *args, **kwargs):
+    if len(args) == 1:
+      if isinstance(args[0], torch.dtype):
+        return XLATensor2(self._elem.astype(t2j_dtype(args[0])), self._env)
+    if 'dtype' in kwargs:
+      dtype = kwargs['dtype']
+      return XLATensor2(self._elem.astype(t2j_dtype(dtype)), self._env)
+    return self
+
+  @property
+  def dtype(self):
+    return j2t_dtype(self._elem.dtype)
+
+
 
 # TODO: slice of slice should also be another slice
 class SliceView(XLATensor2):
@@ -281,3 +257,159 @@ def debug_accuracy(func, args, kwargs, current_output):
       pdb.set_trace()
 
   return True
+
+
+class XLAFunctionMode(torch.overrides.TorchFunctionMode):
+  """Context manager that dispatches torch function calls to JAX."""
+
+  def __init__(self, env):
+     self.env = env
+
+  def __torch_function__(self,
+                         func,
+                         types,
+                         args=(),
+                         kwargs=None) -> torch.Tensor:
+    try:
+      return self.env.dispatch(func, types, args, kwargs)
+    except OperatorNotFound:
+      return func(*args, **(kwargs or {}))
+
+
+class XLADispatchMode(torch_dispatch.TorchDispatchMode):
+
+  def __init__(self, env):
+    self.env = env
+
+  def __torch_dispatch__(self, func, types, args=(), kwargs=None):
+    if isinstance(func, torch._ops.OpOverloadPacket):
+      with self:
+        return func(*args, **kwargs)
+    if func.namespace != 'aten':
+      return func(*args, **kwargs)
+    return self.env.dispatch(func, types, args, kwargs)
+
+def _name_of_func(func):
+  if hasattr(func, 'name'):
+    return func.name()
+  return func.__name__
+
+
+class Environment(contextlib.ContextDecorator):
+    """This class holds a set of configurations and "globals" needed
+
+    for executing torch program using jax.
+    Things included so far:
+
+    op registry
+    PRNGKey
+    Configs
+
+    Also helper functions to manipulate those.
+    """
+
+    _prng_key: jax.random.PRNGKey
+
+
+    def __init__(self, random_seed):
+        self._prng_key = jax.random.PRNGKey(random_seed)
+        self._function_mode = XLAFunctionMode(self)
+        self._dispatch_mode = XLADispatchMode(self)
+
+        # name is torch callable
+        self._ops = {}
+        self.load_ops()
+
+    def load_ops(self):
+      from torch_xla2.ops import jaten, jtorch, ops_registry
+      self._ops.update(ops_registry.all_aten_ops)
+      self._ops.update(ops_registry.all_torch_functions)
+
+      decomps = torch._decomp.core_aten_decompositions() 
+      from torch_xla2.decompositions import EXTRA_DECOMP
+      decomps.update(EXTRA_DECOMP)
+      for k, v in decomps.items():
+        if k not in self._ops:
+          self._ops[k] = ops_registry.Operator(
+            k,
+            v,
+            is_jax_function=False,
+            is_user_defined=False,
+            needs_env=False
+          )
+
+    def get_and_rotate_prng_key(self):
+        self._prng_key, key = jax.random.split(self._prng_key)
+        return key
+
+    def dispatch(self, func, types, args, kwargs):
+      with jax.named_scope(_name_of_func(func)):
+        kwargs = kwargs or {}
+        op = self._ops.get(func)
+
+        if op is None and isinstance(func, torch._ops.OpOverloadPacket):
+          op = self._ops.get(func.default)
+
+        if op is None and isinstance(func, torch._ops.OpOverload):
+          op = self._ops.get(func.overloadpacket)
+
+        if op is None:
+          raise OperatorNotFound(
+            f'Operator with name {_name_of_func(func)} has no lowering')
+
+        if op.is_jax_function:
+          args, kwargs = self.t2j_iso((args, kwargs))
+
+        if op.needs_env:
+          kwargs['env'] = self
+
+        with self:
+          res = op.func(*args, **kwargs)
+
+        if op.is_jax_function:
+          res = self.j2t_iso(res)
+        
+        #if self.config.debug_accuracy_for_each_op:
+        #  debug_accuracy(func, args, kwargs, res)
+        return res
+
+    def __enter__(self):
+      self._dispatch_mode.__enter__()
+      self._function_mode.__enter__()
+      return self
+
+    def __exit__(self, *exc):
+      self._function_mode.__exit__(*exc)
+      self._dispatch_mode.__exit__(*exc)
+
+    def _move_one_value(self, val):
+      if isinstance(val, torch.nn.Module):
+        state_dict = self.to_xla(val.state_dict())
+        val.load_state_dict(state_dict, assign=True)
+        return val
+      if isinstance(val, XLATensor2):
+        return val
+      if isinstance(val, torch.Tensor):
+        return XLATensor2(t2j(val), self)
+      return val
+
+    def to_xla(self, torchvalues):
+      # tensors are torch.Tensors (not XLATensor)
+      res = torch_pytree.tree_map(
+        self._move_one_value,
+          torchvalues)
+      return res
+
+    def t2j_iso(self, torchtensors):
+      return torch_pytree.tree_map_only(
+        XLATensor2, lambda x: x.jax(), torchtensors)
+
+    def j2t_iso(self, jaxarray):
+      return torch_pytree.tree_map_only(
+        jnp.ndarray, lambda x: XLATensor2(x, self), jaxarray)
+
+    def j2t_copy(self, args):
+      pass
+
+    def j2t_copy(self, args):
+      pass
diff --git a/experimental/torch_xla2/torch_xla2/types.py b/experimental/torch_xla2/torch_xla2/types.py
new file mode 100644
index 00000000000..f39d530c18d
--- /dev/null
+++ b/experimental/torch_xla2/torch_xla2/types.py
@@ -0,0 +1,12 @@
+from typing import TypeAlias, Callable, ParamSpec, Any, Union
+import torch
+import jax
+import jax.numpy as jnp
+
+
+P = ParamSpec('P')
+
+TorchValue: TypeAlias = Union[torch.Tensor, torch.dtype, 'TorchCallable', Any]
+TorchCallable: TypeAlias = Callable[P, TorchValue]
+JaxValue: TypeAlias = Union[jax.Array, jnp.dtype, 'JaxCallable', Any]
+JaxCallable: TypeAlias = Callable[P, JaxValue]
\ No newline at end of file
diff --git a/infra/ansible/config/env.yaml b/infra/ansible/config/env.yaml
index 15e8dc79d6c..9e2fe7270cc 100644
--- a/infra/ansible/config/env.yaml
+++ b/infra/ansible/config/env.yaml
@@ -14,7 +14,7 @@ release_env:
     TPUVM_MODE: 1
 
   cuda:
-    TF_CUDA_COMPUTE_CAPABILITIES: 7.0,7.5,8.0,9.0
+    TF_CUDA_COMPUTE_CAPABILITIES: "{{ cuda_compute_capabilities }}"
     XLA_CUDA: 1
 
 # Variables that will be passed to shell environment only for building PyTorch and XLA libs.
@@ -22,7 +22,7 @@ build_env:
   common:
     LD_LIBRARY_PATH: "$LD_LIBRARY_PATH:/usr/local/lib"
     # Set explicitly to 0 as setup.py defaults this flag to true if unset.
-    BUILD_CPP_TESTS: 0
+    BUILD_CPP_TESTS: "{{ build_cpp_tests }}"
     # Force GCC because clang/bazel has issues.
     CC: gcc-10
     CXX: g++-10
@@ -31,9 +31,9 @@ build_env:
     PYTORCH_BUILD_VERSION: "{{ package_version }}"
     XLA_SANDBOX_BUILD: 1
     BAZEL_REMOTE_CACHE: 1
-    SILO_NAME: "cache-silo-{{ arch }}-{{ accelerator }}-{{ clang_version }}"
+    SILO_NAME: "cache-silo-{{ arch }}-{{ accelerator }}-{{ clang_version }}{{ cache_suffix }}"
     _GLIBCXX_USE_CXX11_ABI: 0
-    GIT_VERSIONED_XLA_BUILD: "{{ nightly_release }}"
+    GIT_VERSIONED_XLA_BUILD: "{{ nightly_release or git_versioned_xla_build }}"
 
   amd64:
     ARCH: amd64
@@ -41,7 +41,7 @@ build_env:
   aarch64:
 
   cuda:
-    TF_CUDA_COMPUTE_CAPABILITIES: 7.0,7.5,8.0,9.0
+    TF_CUDA_COMPUTE_CAPABILITIES: "{{ cuda_compute_capabilities }}"
     XLA_CUDA: 1
 
   tpu:
diff --git a/infra/ansible/config/vars.yaml b/infra/ansible/config/vars.yaml
index 2347d066e84..e5851d0cc77 100644
--- a/infra/ansible/config/vars.yaml
+++ b/infra/ansible/config/vars.yaml
@@ -1,6 +1,8 @@
 # Used for fetching cuda from the right repo, see apt.yaml.
 cuda_repo: debian11
 cuda_version: "11.8"
+# Determines supported GPUs. See https://developer.nvidia.com/cuda-gpus
+cuda_compute_capabilities: 7.0,7.5,8.0,9.0
 # Used for fetching clang from the right repo, see apt.yaml.
 llvm_debian_repo: bullseye
 clang_version: 17
@@ -10,3 +12,9 @@ package_version: 2.4.0
 nightly_release: false
 # Whether to preinstall libtpu in the PyTorch/XLA wheel. Ignored for GPU build.
 bundle_libtpu: 1
+# Suffix for bazel remote cache key
+cache_suffix: ""
+# Whether to build C++ tests with `torch_xla` wheel
+build_cpp_tests: 0
+# Whether to tag wheels with git hash, e.g. X.Y.Z+git123abc
+git_versioned_xla_build: false
diff --git a/infra/ansible/roles/build_srcs/tasks/main.yaml b/infra/ansible/roles/build_srcs/tasks/main.yaml
index d945f150d38..da09a695453 100644
--- a/infra/ansible/roles/build_srcs/tasks/main.yaml
+++ b/infra/ansible/roles/build_srcs/tasks/main.yaml
@@ -1,3 +1,27 @@
+- name: Read PyTorch pin
+  ansible.builtin.command: cat {{ (src_root, 'pytorch/xla/.torch_pin') | path_join }}
+  register: torch_pin
+  # Pin may not exist
+  ignore_errors: true
+
+- name: Checkout PyTorch pin
+  # ansible.builtin.git wants to fetch the entire history, so check out the pin manually
+  ansible.builtin.shell:
+    cmd: |
+      set -xe
+      PIN="{{ torch_pin.stdout }}"
+      if [[ $PIN = \#* ]]; then
+        PRNUM="${PIN//[!0-9]/}"
+        git fetch origin "pull/$PRNUM/head"
+      else
+        git fetch origin {{ torch_pin.stdout }}
+      fi
+      git checkout --recurse-submodules FETCH_HEAD
+    chdir: "{{ (src_root, 'pytorch') | path_join }}"
+  args:
+    executable: /bin/bash
+  when: torch_pin is succeeded
+
 - name: Build PyTorch
   ansible.builtin.command:
     cmd: python setup.py bdist_wheel
@@ -77,6 +101,22 @@
     state: absent
     mode: '0755'
 
+- name: Create temp directory for C++ tests
+  ansible.builtin.file:
+    path: /tmp/test/bin
+    state: directory
+    mode: '0755'
+  when: build_cpp_tests
+
+- name: Collect C++ test files
+  ansible.builtin.shell: |
+    cd pytorch/xla/build/temp*
+    bazel query 'kind(".*_test", tests(//:cpp_tests))' --output=label | xargs -n 1 bazel cquery --output=files | xargs cp -t /tmp/test/bin
+  args:
+    executable: bash
+    chdir: "{{ src_root }}"
+  when: build_cpp_tests
+
 - name: Read Torchvision pin
   ansible.builtin.command: cat {{ (src_root, 'pytorch') | path_join }}/.github/ci_commit_pins/vision.txt
   register: torchvision_pin
diff --git a/infra/tpu-pytorch-releases/artifacts.auto.tfvars b/infra/tpu-pytorch-releases/artifacts.auto.tfvars
index 0229a79c190..c2617739f45 100644
--- a/infra/tpu-pytorch-releases/artifacts.auto.tfvars
+++ b/infra/tpu-pytorch-releases/artifacts.auto.tfvars
@@ -35,62 +35,70 @@ nightly_builds = [
 versioned_builds = [
   # Remove libtpu from PyPI builds
   {
-    git_tag         = "v2.3.0-rc12"
-    package_version = "2.3.0-rc12"
-    pytorch_git_rev = "v2.3.0-rc12"
+    git_tag         = "v2.3.0"
+    package_version = "2.3.0"
+    pytorch_git_rev = "v2.3.0"
     accelerator     = "tpu"
     python_version = "3.8"
     bundle_libtpu   = "0"
   },
   {
-    git_tag         = "v2.3.0-rc12"
-    package_version = "2.3.0-rc12"
-    pytorch_git_rev = "v2.3.0-rc12"
+    git_tag         = "v2.3.0"
+    package_version = "2.3.0"
+    pytorch_git_rev = "v2.3.0"
     accelerator     = "tpu"
     python_version  = "3.9"
     bundle_libtpu   = "0"
   },
   {
-    git_tag         = "v2.3.0-rc12"
-    package_version = "2.3.0-rc12"
-    pytorch_git_rev = "v2.3.0-rc12"
+    git_tag         = "v2.3.0"
+    package_version = "2.3.0"
+    pytorch_git_rev = "v2.3.0"
     accelerator     = "tpu"
     python_version  = "3.10"
     bundle_libtpu   = "0"
   },
   {
-    git_tag         = "v2.3.0-rc12"
-    package_version = "2.3.0-rc12"
-    pytorch_git_rev = "v2.3.0-rc12"
+    git_tag         = "v2.3.0"
+    package_version = "2.3.0"
+    pytorch_git_rev = "v2.3.0"
     accelerator     = "tpu"
     python_version  = "3.11"
     bundle_libtpu   = "0"
   },
   # Bundle libtpu for Kaggle
   {
-    git_tag         = "v2.3.0-rc12"
-    package_version = "2.3.0-rc12+libtpu"
-    pytorch_git_rev = "v2.3.0-rc12"
+    git_tag         = "v2.3.0"
+    package_version = "2.3.0+libtpu"
+    pytorch_git_rev = "v2.3.0"
     accelerator     = "tpu"
     python_version  = "3.10"
     bundle_libtpu   = "1"
   },
   {
-    git_tag         = "v2.3.0-rc12"
-    pytorch_git_rev = "v2.3.0-rc12"
-    package_version = "2.3.0-rc12"
+    git_tag         = "v2.3.0"
+    pytorch_git_rev = "v2.3.0"
+    package_version = "2.3.0"
     accelerator     = "cuda"
     cuda_version    = "12.1"
     python_version = "3.8"
   },
   {
-    git_tag         = "v2.3.0-rc12"
-    pytorch_git_rev = "v2.3.0-rc12"
-    package_version = "2.3.0-rc12"
+    git_tag         = "v2.3.0"
+    pytorch_git_rev = "v2.3.0"
+    package_version = "2.3.0"
     accelerator     = "cuda"
     cuda_version    = "12.1"
     python_version  = "3.10"
   },
+  {
+    git_tag         = "v2.3.0"
+    pytorch_git_rev = "v2.3.0"
+    package_version = "2.3.0"
+    accelerator     = "cuda"
+    cuda_version    = "12.1"
+    python_version  = "3.11"
+  },
   # Remove libtpu from PyPI builds
   {
     git_tag         = "v2.2.0"
diff --git a/infra/tpu-pytorch-releases/dev_images.tf b/infra/tpu-pytorch-releases/dev_images.tf
index 023ac8b870a..54c340809ef 100644
--- a/infra/tpu-pytorch-releases/dev_images.tf
+++ b/infra/tpu-pytorch-releases/dev_images.tf
@@ -36,8 +36,6 @@ module "dev_images" {
   image_name = "development"
   image_tags = concat(each.value.extra_tags, [
     each.key,
-    # Append _YYYYMMDD suffix to the dev image name.
-    "${each.key}_$(date +%Y%m%d)",
   ])
 
   dockerfile = "development.Dockerfile"
diff --git a/plugins/cuda/torch_xla_cuda_plugin/__init__.py b/plugins/cuda/torch_xla_cuda_plugin/__init__.py
index 9321d26a1a6..e6863ff711a 100644
--- a/plugins/cuda/torch_xla_cuda_plugin/__init__.py
+++ b/plugins/cuda/torch_xla_cuda_plugin/__init__.py
@@ -27,6 +27,9 @@ def physical_chip_count(self) -> int:
     # TODO: default to actual device count
     return xu.getenv_as('GPU_NUM_DEVICES', int, 1)
 
+  def configure_single_process(self):
+    pass
+
   def client_create_options(self) -> dict:
     local_process_rank, global_process_rank = self._get_process_rank()
     local_world_size, global_world_size = self._get_world_size()
diff --git a/requirements.in b/requirements.in
new file mode 100644
index 00000000000..0dafeec6331
--- /dev/null
+++ b/requirements.in
@@ -0,0 +1,9 @@
+filelock
+fsspec
+jinja2
+markupsafe
+mpmath
+networkx
+pyyaml
+sympy
+typing-extensions
diff --git a/requirements_lock_3_10.txt b/requirements_lock_3_10.txt
new file mode 100644
index 00000000000..225f30d1443
--- /dev/null
+++ b/requirements_lock_3_10.txt
@@ -0,0 +1,153 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    bazel run //:requirements.update
+#
+filelock==3.14.0 \
+    --hash=sha256:43339835842f110ca7ae60f1e1c160714c5a6afd15a2873419ab185334975c0f \
+    --hash=sha256:6ea72da3be9b8c82afd3edcf99f2fffbb5076335a5ae4d03248bb5b6c3eae78a
+    # via -r requirements.in
+fsspec==2024.5.0 \
+    --hash=sha256:1d021b0b0f933e3b3029ed808eb400c08ba101ca2de4b3483fbc9ca23fcee94a \
+    --hash=sha256:e0fdbc446d67e182f49a70b82cf7889028a63588fde6b222521f10937b2b670c
+    # via -r requirements.in
+jinja2==3.1.4 \
+    --hash=sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369 \
+    --hash=sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d
+    # via -r requirements.in
+markupsafe==2.1.5 \
+    --hash=sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf \
+    --hash=sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff \
+    --hash=sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f \
+    --hash=sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3 \
+    --hash=sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532 \
+    --hash=sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f \
+    --hash=sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617 \
+    --hash=sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df \
+    --hash=sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4 \
+    --hash=sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906 \
+    --hash=sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f \
+    --hash=sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4 \
+    --hash=sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8 \
+    --hash=sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371 \
+    --hash=sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2 \
+    --hash=sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465 \
+    --hash=sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52 \
+    --hash=sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6 \
+    --hash=sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169 \
+    --hash=sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad \
+    --hash=sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2 \
+    --hash=sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0 \
+    --hash=sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029 \
+    --hash=sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f \
+    --hash=sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a \
+    --hash=sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced \
+    --hash=sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5 \
+    --hash=sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c \
+    --hash=sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf \
+    --hash=sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9 \
+    --hash=sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb \
+    --hash=sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad \
+    --hash=sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3 \
+    --hash=sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1 \
+    --hash=sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46 \
+    --hash=sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc \
+    --hash=sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a \
+    --hash=sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee \
+    --hash=sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900 \
+    --hash=sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5 \
+    --hash=sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea \
+    --hash=sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f \
+    --hash=sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5 \
+    --hash=sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e \
+    --hash=sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a \
+    --hash=sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f \
+    --hash=sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50 \
+    --hash=sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a \
+    --hash=sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b \
+    --hash=sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4 \
+    --hash=sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff \
+    --hash=sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2 \
+    --hash=sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46 \
+    --hash=sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b \
+    --hash=sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf \
+    --hash=sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5 \
+    --hash=sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5 \
+    --hash=sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab \
+    --hash=sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd \
+    --hash=sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68
+    # via
+    #   -r requirements.in
+    #   jinja2
+mpmath==1.3.0 \
+    --hash=sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f \
+    --hash=sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c
+    # via
+    #   -r requirements.in
+    #   sympy
+networkx==3.3 \
+    --hash=sha256:0c127d8b2f4865f59ae9cb8aafcd60b5c70f3241ebd66f7defad7c4ab90126c9 \
+    --hash=sha256:28575580c6ebdaf4505b22c6256a2b9de86b316dc63ba9e93abde3d78dfdbcf2
+    # via -r requirements.in
+pyyaml==6.0.1 \
+    --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \
+    --hash=sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc \
+    --hash=sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df \
+    --hash=sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741 \
+    --hash=sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206 \
+    --hash=sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27 \
+    --hash=sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595 \
+    --hash=sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62 \
+    --hash=sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98 \
+    --hash=sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696 \
+    --hash=sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290 \
+    --hash=sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9 \
+    --hash=sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d \
+    --hash=sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6 \
+    --hash=sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867 \
+    --hash=sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47 \
+    --hash=sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486 \
+    --hash=sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6 \
+    --hash=sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3 \
+    --hash=sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007 \
+    --hash=sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938 \
+    --hash=sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0 \
+    --hash=sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c \
+    --hash=sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735 \
+    --hash=sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d \
+    --hash=sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28 \
+    --hash=sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4 \
+    --hash=sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba \
+    --hash=sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8 \
+    --hash=sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef \
+    --hash=sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5 \
+    --hash=sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd \
+    --hash=sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3 \
+    --hash=sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0 \
+    --hash=sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515 \
+    --hash=sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c \
+    --hash=sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c \
+    --hash=sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924 \
+    --hash=sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34 \
+    --hash=sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43 \
+    --hash=sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859 \
+    --hash=sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673 \
+    --hash=sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54 \
+    --hash=sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a \
+    --hash=sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b \
+    --hash=sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab \
+    --hash=sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa \
+    --hash=sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c \
+    --hash=sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585 \
+    --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \
+    --hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f
+    # via -r requirements.in
+sympy==1.12 \
+    --hash=sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5 \
+    --hash=sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8
+    # via -r requirements.in
+typing-extensions==4.11.0 \
+    --hash=sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0 \
+    --hash=sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a
+    # via -r requirements.in
diff --git a/requirements_lock_3_11.txt b/requirements_lock_3_11.txt
new file mode 100644
index 00000000000..78862541e94
--- /dev/null
+++ b/requirements_lock_3_11.txt
@@ -0,0 +1,153 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    bazel run //:requirements.update
+#
+filelock==3.14.0 \
+    --hash=sha256:43339835842f110ca7ae60f1e1c160714c5a6afd15a2873419ab185334975c0f \
+    --hash=sha256:6ea72da3be9b8c82afd3edcf99f2fffbb5076335a5ae4d03248bb5b6c3eae78a
+    # via -r requirements.in
+fsspec==2024.5.0 \
+    --hash=sha256:1d021b0b0f933e3b3029ed808eb400c08ba101ca2de4b3483fbc9ca23fcee94a \
+    --hash=sha256:e0fdbc446d67e182f49a70b82cf7889028a63588fde6b222521f10937b2b670c
+    # via -r requirements.in
+jinja2==3.1.4 \
+    --hash=sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369 \
+    --hash=sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d
+    # via -r requirements.in
+markupsafe==2.1.5 \
+    --hash=sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf \
+    --hash=sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff \
+    --hash=sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f \
+    --hash=sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3 \
+    --hash=sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532 \
+    --hash=sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f \
+    --hash=sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617 \
+    --hash=sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df \
+    --hash=sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4 \
+    --hash=sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906 \
+    --hash=sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f \
+    --hash=sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4 \
+    --hash=sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8 \
+    --hash=sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371 \
+    --hash=sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2 \
+    --hash=sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465 \
+    --hash=sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52 \
+    --hash=sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6 \
+    --hash=sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169 \
+    --hash=sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad \
+    --hash=sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2 \
+    --hash=sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0 \
+    --hash=sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029 \
+    --hash=sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f \
+    --hash=sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a \
+    --hash=sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced \
+    --hash=sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5 \
+    --hash=sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c \
+    --hash=sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf \
+    --hash=sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9 \
+    --hash=sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb \
+    --hash=sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad \
+    --hash=sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3 \
+    --hash=sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1 \
+    --hash=sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46 \
+    --hash=sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc \
+    --hash=sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a \
+    --hash=sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee \
+    --hash=sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900 \
+    --hash=sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5 \
+    --hash=sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea \
+    --hash=sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f \
+    --hash=sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5 \
+    --hash=sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e \
+    --hash=sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a \
+    --hash=sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f \
+    --hash=sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50 \
+    --hash=sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a \
+    --hash=sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b \
+    --hash=sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4 \
+    --hash=sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff \
+    --hash=sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2 \
+    --hash=sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46 \
+    --hash=sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b \
+    --hash=sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf \
+    --hash=sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5 \
+    --hash=sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5 \
+    --hash=sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab \
+    --hash=sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd \
+    --hash=sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68
+    # via
+    #   -r requirements.in
+    #   jinja2
+mpmath==1.3.0 \
+    --hash=sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f \
+    --hash=sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c
+    # via
+    #   -r requirements.in
+    #   sympy
+networkx==3.3 \
+    --hash=sha256:0c127d8b2f4865f59ae9cb8aafcd60b5c70f3241ebd66f7defad7c4ab90126c9 \
+    --hash=sha256:28575580c6ebdaf4505b22c6256a2b9de86b316dc63ba9e93abde3d78dfdbcf2
+    # via -r requirements.in
+pyyaml==6.0.1 \
+    --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \
+    --hash=sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc \
+    --hash=sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df \
+    --hash=sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741 \
+    --hash=sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206 \
+    --hash=sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27 \
+    --hash=sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595 \
+    --hash=sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62 \
+    --hash=sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98 \
+    --hash=sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696 \
+    --hash=sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290 \
+    --hash=sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9 \
+    --hash=sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d \
+    --hash=sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6 \
+    --hash=sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867 \
+    --hash=sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47 \
+    --hash=sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486 \
+    --hash=sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6 \
+    --hash=sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3 \
+    --hash=sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007 \
+    --hash=sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938 \
+    --hash=sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0 \
+    --hash=sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c \
+    --hash=sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735 \
+    --hash=sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d \
+    --hash=sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28 \
+    --hash=sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4 \
+    --hash=sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba \
+    --hash=sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8 \
+    --hash=sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef \
+    --hash=sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5 \
+    --hash=sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd \
+    --hash=sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3 \
+    --hash=sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0 \
+    --hash=sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515 \
+    --hash=sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c \
+    --hash=sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c \
+    --hash=sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924 \
+    --hash=sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34 \
+    --hash=sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43 \
+    --hash=sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859 \
+    --hash=sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673 \
+    --hash=sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54 \
+    --hash=sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a \
+    --hash=sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b \
+    --hash=sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab \
+    --hash=sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa \
+    --hash=sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c \
+    --hash=sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585 \
+    --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \
+    --hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f
+    # via -r requirements.in
+sympy==1.12 \
+    --hash=sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5 \
+    --hash=sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8
+    # via -r requirements.in
+typing-extensions==4.11.0 \
+    --hash=sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0 \
+    --hash=sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a
+    # via -r requirements.in
diff --git a/requirements_lock_3_8.txt b/requirements_lock_3_8.txt
new file mode 100644
index 00000000000..022d1e07f3e
--- /dev/null
+++ b/requirements_lock_3_8.txt
@@ -0,0 +1,153 @@
+#
+# This file is autogenerated by pip-compile with Python 3.8
+# by the following command:
+#
+#    bazel run //:requirements.update
+#
+filelock==3.14.0 \
+    --hash=sha256:43339835842f110ca7ae60f1e1c160714c5a6afd15a2873419ab185334975c0f \
+    --hash=sha256:6ea72da3be9b8c82afd3edcf99f2fffbb5076335a5ae4d03248bb5b6c3eae78a
+    # via -r requirements.in
+fsspec==2024.5.0 \
+    --hash=sha256:1d021b0b0f933e3b3029ed808eb400c08ba101ca2de4b3483fbc9ca23fcee94a \
+    --hash=sha256:e0fdbc446d67e182f49a70b82cf7889028a63588fde6b222521f10937b2b670c
+    # via -r requirements.in
+jinja2==3.1.4 \
+    --hash=sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369 \
+    --hash=sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d
+    # via -r requirements.in
+markupsafe==2.1.5 \
+    --hash=sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf \
+    --hash=sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff \
+    --hash=sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f \
+    --hash=sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3 \
+    --hash=sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532 \
+    --hash=sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f \
+    --hash=sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617 \
+    --hash=sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df \
+    --hash=sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4 \
+    --hash=sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906 \
+    --hash=sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f \
+    --hash=sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4 \
+    --hash=sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8 \
+    --hash=sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371 \
+    --hash=sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2 \
+    --hash=sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465 \
+    --hash=sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52 \
+    --hash=sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6 \
+    --hash=sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169 \
+    --hash=sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad \
+    --hash=sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2 \
+    --hash=sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0 \
+    --hash=sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029 \
+    --hash=sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f \
+    --hash=sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a \
+    --hash=sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced \
+    --hash=sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5 \
+    --hash=sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c \
+    --hash=sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf \
+    --hash=sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9 \
+    --hash=sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb \
+    --hash=sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad \
+    --hash=sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3 \
+    --hash=sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1 \
+    --hash=sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46 \
+    --hash=sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc \
+    --hash=sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a \
+    --hash=sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee \
+    --hash=sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900 \
+    --hash=sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5 \
+    --hash=sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea \
+    --hash=sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f \
+    --hash=sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5 \
+    --hash=sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e \
+    --hash=sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a \
+    --hash=sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f \
+    --hash=sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50 \
+    --hash=sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a \
+    --hash=sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b \
+    --hash=sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4 \
+    --hash=sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff \
+    --hash=sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2 \
+    --hash=sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46 \
+    --hash=sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b \
+    --hash=sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf \
+    --hash=sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5 \
+    --hash=sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5 \
+    --hash=sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab \
+    --hash=sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd \
+    --hash=sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68
+    # via
+    #   -r requirements.in
+    #   jinja2
+mpmath==1.3.0 \
+    --hash=sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f \
+    --hash=sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c
+    # via
+    #   -r requirements.in
+    #   sympy
+networkx==3.1 \
+    --hash=sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36 \
+    --hash=sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61
+    # via -r requirements.in
+pyyaml==6.0.1 \
+    --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \
+    --hash=sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc \
+    --hash=sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df \
+    --hash=sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741 \
+    --hash=sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206 \
+    --hash=sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27 \
+    --hash=sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595 \
+    --hash=sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62 \
+    --hash=sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98 \
+    --hash=sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696 \
+    --hash=sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290 \
+    --hash=sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9 \
+    --hash=sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d \
+    --hash=sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6 \
+    --hash=sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867 \
+    --hash=sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47 \
+    --hash=sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486 \
+    --hash=sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6 \
+    --hash=sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3 \
+    --hash=sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007 \
+    --hash=sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938 \
+    --hash=sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0 \
+    --hash=sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c \
+    --hash=sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735 \
+    --hash=sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d \
+    --hash=sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28 \
+    --hash=sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4 \
+    --hash=sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba \
+    --hash=sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8 \
+    --hash=sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef \
+    --hash=sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5 \
+    --hash=sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd \
+    --hash=sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3 \
+    --hash=sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0 \
+    --hash=sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515 \
+    --hash=sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c \
+    --hash=sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c \
+    --hash=sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924 \
+    --hash=sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34 \
+    --hash=sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43 \
+    --hash=sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859 \
+    --hash=sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673 \
+    --hash=sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54 \
+    --hash=sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a \
+    --hash=sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b \
+    --hash=sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab \
+    --hash=sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa \
+    --hash=sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c \
+    --hash=sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585 \
+    --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \
+    --hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f
+    # via -r requirements.in
+sympy==1.12 \
+    --hash=sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5 \
+    --hash=sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8
+    # via -r requirements.in
+typing-extensions==4.11.0 \
+    --hash=sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0 \
+    --hash=sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a
+    # via -r requirements.in
diff --git a/requirements_lock_3_9.txt b/requirements_lock_3_9.txt
new file mode 100644
index 00000000000..a01cb47146d
--- /dev/null
+++ b/requirements_lock_3_9.txt
@@ -0,0 +1,153 @@
+#
+# This file is autogenerated by pip-compile with Python 3.9
+# by the following command:
+#
+#    bazel run //:requirements.update
+#
+filelock==3.14.0 \
+    --hash=sha256:43339835842f110ca7ae60f1e1c160714c5a6afd15a2873419ab185334975c0f \
+    --hash=sha256:6ea72da3be9b8c82afd3edcf99f2fffbb5076335a5ae4d03248bb5b6c3eae78a
+    # via -r requirements.in
+fsspec==2024.5.0 \
+    --hash=sha256:1d021b0b0f933e3b3029ed808eb400c08ba101ca2de4b3483fbc9ca23fcee94a \
+    --hash=sha256:e0fdbc446d67e182f49a70b82cf7889028a63588fde6b222521f10937b2b670c
+    # via -r requirements.in
+jinja2==3.1.4 \
+    --hash=sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369 \
+    --hash=sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d
+    # via -r requirements.in
+markupsafe==2.1.5 \
+    --hash=sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf \
+    --hash=sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff \
+    --hash=sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f \
+    --hash=sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3 \
+    --hash=sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532 \
+    --hash=sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f \
+    --hash=sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617 \
+    --hash=sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df \
+    --hash=sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4 \
+    --hash=sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906 \
+    --hash=sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f \
+    --hash=sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4 \
+    --hash=sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8 \
+    --hash=sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371 \
+    --hash=sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2 \
+    --hash=sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465 \
+    --hash=sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52 \
+    --hash=sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6 \
+    --hash=sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169 \
+    --hash=sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad \
+    --hash=sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2 \
+    --hash=sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0 \
+    --hash=sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029 \
+    --hash=sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f \
+    --hash=sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a \
+    --hash=sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced \
+    --hash=sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5 \
+    --hash=sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c \
+    --hash=sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf \
+    --hash=sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9 \
+    --hash=sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb \
+    --hash=sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad \
+    --hash=sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3 \
+    --hash=sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1 \
+    --hash=sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46 \
+    --hash=sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc \
+    --hash=sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a \
+    --hash=sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee \
+    --hash=sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900 \
+    --hash=sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5 \
+    --hash=sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea \
+    --hash=sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f \
+    --hash=sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5 \
+    --hash=sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e \
+    --hash=sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a \
+    --hash=sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f \
+    --hash=sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50 \
+    --hash=sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a \
+    --hash=sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b \
+    --hash=sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4 \
+    --hash=sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff \
+    --hash=sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2 \
+    --hash=sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46 \
+    --hash=sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b \
+    --hash=sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf \
+    --hash=sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5 \
+    --hash=sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5 \
+    --hash=sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab \
+    --hash=sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd \
+    --hash=sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68
+    # via
+    #   -r requirements.in
+    #   jinja2
+mpmath==1.3.0 \
+    --hash=sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f \
+    --hash=sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c
+    # via
+    #   -r requirements.in
+    #   sympy
+networkx==3.2.1 \
+    --hash=sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6 \
+    --hash=sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2
+    # via -r requirements.in
+pyyaml==6.0.1 \
+    --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \
+    --hash=sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc \
+    --hash=sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df \
+    --hash=sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741 \
+    --hash=sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206 \
+    --hash=sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27 \
+    --hash=sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595 \
+    --hash=sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62 \
+    --hash=sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98 \
+    --hash=sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696 \
+    --hash=sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290 \
+    --hash=sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9 \
+    --hash=sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d \
+    --hash=sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6 \
+    --hash=sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867 \
+    --hash=sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47 \
+    --hash=sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486 \
+    --hash=sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6 \
+    --hash=sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3 \
+    --hash=sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007 \
+    --hash=sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938 \
+    --hash=sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0 \
+    --hash=sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c \
+    --hash=sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735 \
+    --hash=sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d \
+    --hash=sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28 \
+    --hash=sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4 \
+    --hash=sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba \
+    --hash=sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8 \
+    --hash=sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef \
+    --hash=sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5 \
+    --hash=sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd \
+    --hash=sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3 \
+    --hash=sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0 \
+    --hash=sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515 \
+    --hash=sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c \
+    --hash=sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c \
+    --hash=sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924 \
+    --hash=sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34 \
+    --hash=sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43 \
+    --hash=sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859 \
+    --hash=sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673 \
+    --hash=sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54 \
+    --hash=sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a \
+    --hash=sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b \
+    --hash=sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab \
+    --hash=sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa \
+    --hash=sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c \
+    --hash=sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585 \
+    --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \
+    --hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f
+    # via -r requirements.in
+sympy==1.12 \
+    --hash=sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5 \
+    --hash=sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8
+    # via -r requirements.in
+typing-extensions==4.11.0 \
+    --hash=sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0 \
+    --hash=sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a
+    # via -r requirements.in
diff --git a/scripts/apply_patches.sh b/scripts/apply_patches.sh
index 923b68c79d4..7ba0a3ef8e3 100755
--- a/scripts/apply_patches.sh
+++ b/scripts/apply_patches.sh
@@ -7,7 +7,7 @@ XDIR=$CDIR/..
 PTDIR=$XDIR/..
 OPENXLADIR=$XDIR/third_party/xla
 
-TORCH_PIN="$XDIR/torch_patches/.torch_pin"
+TORCH_PIN="$XDIR/.torch_pin"
 if [ -f "$TORCH_PIN" ]; then
   CID=$(cat "$TORCH_PIN")
   # If starts with # and it's not merged into master, fetch from origin
diff --git a/setup.py b/setup.py
index d45b0b7fc3c..6f81877d0d2 100644
--- a/setup.py
+++ b/setup.py
@@ -64,10 +64,10 @@
 
 base_dir = os.path.dirname(os.path.abspath(__file__))
 
-_date = '20240409'
+_date = '20240527'
 _libtpu_version = f'0.1.dev{_date}'
 _libtpu_storage_path = f'https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/wheels/libtpu-nightly/libtpu_nightly-{_libtpu_version}-py3-none-any.whl'
-_jax_version = f'0.4.27.dev{_date}'
+_jax_version = f'0.4.29.dev{_date}'
 
 
 def _get_build_mode():
@@ -223,6 +223,10 @@ def bazel_build(self, ext):
         f"--symlink_prefix={os.path.join(self.build_temp, 'bazel-')}"
     ]
 
+    build_cpp_tests = build_util.check_env_flag('BUILD_CPP_TESTS', default='0')
+    if build_cpp_tests:
+      bazel_argv.append('//:cpp_tests')
+
     import torch
     cxx_abi = os.getenv('CXX_ABI') or getattr(torch._C,
                                               '_GLIBCXX_USE_CXX11_ABI', None)
diff --git a/test/benchmarks/run_tests.sh b/test/benchmarks/run_tests.sh
index 7d404a7ee7f..fce6140a4fe 100755
--- a/test/benchmarks/run_tests.sh
+++ b/test/benchmarks/run_tests.sh
@@ -9,7 +9,7 @@ export PYTHONPATH=$PYTHONPATH:$CDIR/../../benchmarks/
 
 # Note [Keep Going]
 #
-# Set the `CONTINUE_ON_ERROR` flag to `true` to make the CircleCI tests continue on error.
+# Set the `CONTINUE_ON_ERROR` flag to `true` to make the CI tests continue on error.
 # This will allow you to see all the failures on your PR, not stopping with the first
 # test failure like the default behavior.
 CONTINUE_ON_ERROR="${CONTINUE_ON_ERROR:-0}"
@@ -39,10 +39,14 @@ function run_make_tests {
 }
 
 function run_python_tests {
-  python3 "$CDIR/test_experiment_runner.py"
-  python3 "$CDIR/test_benchmark_experiment.py"
-  python3 "$CDIR/test_benchmark_model.py"
-  python3 "$CDIR/test_result_analyzer.py"
+  # HACK: don't confuse local `torch_xla` folder with installed package
+  # Python 3.11 has the permanent fix: https://stackoverflow.com/a/73636559
+  pushd $CDIR
+  python3 "test_experiment_runner.py"
+  python3 "test_benchmark_experiment.py"
+  python3 "test_benchmark_model.py"
+  python3 "test_result_analyzer.py"
+  popd
 }
 
 function run_tests {
diff --git a/test/benchmarks/test_benchmark_experiment.py b/test/benchmarks/test_benchmark_experiment.py
index 06c004bcf1e..461df56687f 100644
--- a/test/benchmarks/test_benchmark_experiment.py
+++ b/test/benchmarks/test_benchmark_experiment.py
@@ -6,15 +6,16 @@
 class BenchmarkExperimentTest(unittest.TestCase):
 
   def test_to_dict(self):
-    be = BenchmarkExperiment("cpu", "PJRT", "some xla_flags", "openxla",
+    be = BenchmarkExperiment("cpu", "PJRT", "some xla_flags", "openxla", None,
                              "train", "123")
     actual = be.to_dict()
-    self.assertEqual(7, len(actual))
+    self.assertEqual(8, len(actual))
     self.assertEqual("cpu", actual["accelerator"])
     self.assertTrue("accelerator_model" in actual)
     self.assertEqual("PJRT", actual["xla"])
     self.assertEqual("some xla_flags", actual["xla_flags"])
     self.assertEqual("openxla", actual["dynamo"])
+    self.assertEqual(None, actual["torch_xla2"])
     self.assertEqual("train", actual["test"])
     self.assertEqual("123", actual["batch_size"])
 
diff --git a/test/benchmarks/test_experiment_runner.py b/test/benchmarks/test_experiment_runner.py
index 3f6a32a168e..81386d7d82d 100644
--- a/test/benchmarks/test_experiment_runner.py
+++ b/test/benchmarks/test_experiment_runner.py
@@ -29,10 +29,10 @@ def test_dummy_dry_run(self):
     expected_in_stderr = [
         "Number of selected experiment configs: 4",
         "Number of selected model configs: 1",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"test\": \"eval\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"test\": \"train\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"test\": \"eval\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"test\": \"train\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\"}",
     ]
     for expected in expected_in_stderr:
       self.assertIn(expected, child.stderr)
@@ -57,10 +57,10 @@ def test_dummy_dry_run_cuda(self):
     expected_in_stderr = [
         "Number of selected experiment configs: 4",
         "Number of selected model configs: 1",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"test\": \"eval\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"test\": \"train\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"test\": \"eval\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"test\": \"train\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\"}",
     ]
     for expected in expected_in_stderr:
       self.assertIn(expected, child.stderr)
@@ -85,8 +85,8 @@ def test_dummy_dry_run_inductor_cuda(self):
     expected_in_stderr = [
         "Number of selected experiment configs: 2",
         "Number of selected model configs: 1",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"test\": \"eval\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"test\": \"train\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\"}",
     ]
     for expected in expected_in_stderr:
       self.assertIn(expected, child.stderr)
@@ -113,11 +113,11 @@ def test_dummy_openxla_eval_train_cuda(self):
     expected_in_stderr = [
         "Number of selected experiment configs: 5",
         "Number of selected model configs: 1",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla_eval\", \"test\": \"eval\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"test\": \"train\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"test\": \"eval\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"test\": \"eval\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"test\": \"train\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla_eval\", \"torch_xla2\": null, \"test\": \"eval\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\"}",
     ]
     for expected in expected_in_stderr:
       self.assertIn(expected, child.stderr)
@@ -139,15 +139,15 @@ def test_dummy_dynamo_none_cuda(self):
     expected_in_stderr = [
         "Number of selected experiment configs: 9",
         "Number of selected model configs: 1",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"test\": \"eval\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"test\": \"train\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla_eval\", \"test\": \"eval\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"test\": \"eval\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"test\": \"train\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"test\": \"eval\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"test\": \"train\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"test\": \"eval\"}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"test\": \"train\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"eval\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"train\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla_eval\", \"torch_xla2\": null, \"test\": \"eval\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"eval\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"train\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\"}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\"}",
     ]
     for expected in expected_in_stderr:
       self.assertIn(expected, child.stderr)
diff --git a/test/cpp/run_tests.sh b/test/cpp/run_tests.sh
index 74244322840..d6b492dc694 100755
--- a/test/cpp/run_tests.sh
+++ b/test/cpp/run_tests.sh
@@ -5,7 +5,7 @@ BUILDTYPE="opt"
 VERB=
 FILTER=
 LOGFILE=/tmp/pytorch_cpp_test.log
-XLA_EXPERIMENTAL="nonzero:masked_select"
+XLA_EXPERIMENTAL="nonzero:masked_select:nms"
 BAZEL_REMOTE_CACHE="0"
 BAZEL_VERB="test"
 
diff --git a/test/cpp/test_aten_xla_tensor_4.cpp b/test/cpp/test_aten_xla_tensor_4.cpp
index ff6130ca1b9..7a02a1079a6 100644
--- a/test/cpp/test_aten_xla_tensor_4.cpp
+++ b/test/cpp/test_aten_xla_tensor_4.cpp
@@ -504,6 +504,19 @@ TEST_F(AtenXlaTensorTest, TestDivScalar) {
   ExpectCounterChanged("xla::div", cpp_test::GetIgnoredCounters());
 }
 
+TEST_F(AtenXlaTensorTest, TestDivScalarHalfOverflow) {
+  torch::Tensor input = torch::rand({3, 4}, torch::TensorOptions(torch::kHalf));
+  torch::Scalar other = torch::Scalar(100000);
+  torch::Tensor out = torch::div(input, other);
+  ForEachDevice([&](const torch::Device& device) {
+    torch::Tensor xla_input = CopyToDevice(input, device);
+    torch::Tensor xla_out = torch::div(xla_input, other);
+    AllClose(out, xla_out);
+  });
+  ExpectCounterNotChanged("aten::.*", cpp_test::GetIgnoredCounters());
+  ExpectCounterChanged("xla::div", cpp_test::GetIgnoredCounters());
+}
+
 TEST_F(AtenXlaTensorTest, TestDivScalarInPlace) {
   for (torch::ScalarType scalar_type : {torch::kFloat}) {
     torch::Tensor a =
diff --git a/test/cpp/test_aten_xla_tensor_5.cpp b/test/cpp/test_aten_xla_tensor_5.cpp
index 4070779529f..07e4c2dae86 100644
--- a/test/cpp/test_aten_xla_tensor_5.cpp
+++ b/test/cpp/test_aten_xla_tensor_5.cpp
@@ -267,6 +267,27 @@ TEST_F(AtenXlaTensorTest, TestEmbedding) {
   });
 }
 
+TEST_F(AtenXlaTensorTest, TestEmbeddingBag) {
+  torch::Tensor weight =
+      torch::rand({32, 4}, torch::TensorOptions(torch::kFloat));
+  torch::Tensor indices =
+      torch::randint(0, 31, {10}, torch::TensorOptions(torch::kLong));
+  torch::Tensor offsets = torch::arange(0, 10, 3);
+  auto out = torch::embedding_bag(weight, indices, offsets);
+  torch::Tensor result = std::get<0>(out);
+  ForEachDevice([&](const torch::Device& device) {
+    torch::Tensor xla_weight = CopyToDevice(weight, device);
+    torch::Tensor xla_indices = CopyToDevice(indices, device);
+    torch::Tensor xla_offsets = CopyToDevice(offsets, device);
+    auto xla_out = torch::embedding_bag(xla_weight, xla_indices, xla_offsets);
+    torch::Tensor xla_result = std::get<0>(xla_out);
+    AllClose(result, xla_result);
+    ExpectCounterNotChanged("aten::.*", cpp_test::GetIgnoredCounters());
+    ExpectCounterChanged("xla::_embedding_bag_forward_only",
+                         cpp_test::GetIgnoredCounters());
+  });
+}
+
 TEST_F(AtenXlaTensorTest, TestOneHot) {
   int num_classes = 5;
   torch::Tensor input =
diff --git a/test/debug_tool/extract_debug_helper.py b/test/debug_tool/extract_debug_helper.py
index 31e72fdda51..af82545192a 100644
--- a/test/debug_tool/extract_debug_helper.py
+++ b/test/debug_tool/extract_debug_helper.py
@@ -28,6 +28,14 @@ class GraphInfo(NamedTuple):
   num_output: int
 
 
+class PostCompilationInfo(NamedTuple):
+  input_size: str
+  output_size: str
+  aliased_size: str
+  intermediate_size: str
+  program_size: str
+
+
 def extract_graph_infos(lines):
   infos = []
   for i in range(len(lines)):
@@ -42,6 +50,28 @@ def extract_graph_infos(lines):
   return infos
 
 
+def extract_post_compilation_analysis(lines):
+  infos = []
+  i = 0
+  while i < len(lines):
+    if 'Post Compilation Analysis' in lines[i].decode():
+      input_size = lines[i + 1].decode().split('Graph input size: ')[1].strip()
+      output_size = lines[i +
+                          2].decode().split('Graph output size: ')[1].strip()
+      aliased_size = lines[i +
+                           3].decode().split('Aliased Input size: ')[1].strip()
+      intermediate_size = lines[i + 4].decode().split(
+          'Intermediate tensor size: ')[1].strip()
+      program_size = lines[i + 5].decode().split(
+          'Compiled program size: ')[1].strip()
+      infos.append(
+          PostCompilationInfo(input_size, output_size, aliased_size,
+                              intermediate_size, program_size))
+      i += 7
+    i += 1
+  return infos
+
+
 def extract_python_frames(lines):
   frames = []
   current_frame = ''
diff --git a/test/debug_tool/test_mp_pt_xla_debug.py b/test/debug_tool/test_mp_pt_xla_debug.py
index 7ab7f12e740..15f82d11fe1 100644
--- a/test/debug_tool/test_mp_pt_xla_debug.py
+++ b/test/debug_tool/test_mp_pt_xla_debug.py
@@ -32,14 +32,14 @@ def _mp_fn(index):
     # only the local master process should dump the executation analysis
     assert (len(causes) == 1)
     assert ('user mark_step' in causes[0])
-    assert (len(frames) == 2)
+    assert (len(frames) == 3)
     max_frame = os.getenv('PT_XLA_DEBUG_MAX_FRAME', 8)
     # Additonal lines are
     # 1. Python Frame Triggered Execution:
     # 2. ....
     # 3. empty line
     assert (len(frames[0].split('\n')) == max_frame + 3)
-    assert (len(frames[1].split('\n')) == max_frame + 3)
+    assert (len(frames[2].split('\n')) == max_frame + 3)
 
 
 if __name__ == '__main__':
diff --git a/test/debug_tool/test_pt_xla_debug.py b/test/debug_tool/test_pt_xla_debug.py
index 98a694796a0..ed83b236a11 100644
--- a/test/debug_tool/test_pt_xla_debug.py
+++ b/test/debug_tool/test_pt_xla_debug.py
@@ -9,15 +9,20 @@
 import torch_xla.distributed.parallel_loader as pl
 import unittest
 from extract_debug_helper import (check_env_flag, extract_execution_cause,
-                                  extract_compilation_cause, GraphInfo,
-                                  extract_graph_infos, extract_python_frames)
+                                  extract_compilation_cause,
+                                  extract_graph_infos, extract_python_frames,
+                                  extract_post_compilation_analysis)
 
 
 class PtXLADebugTest(unittest.TestCase):
 
   @classmethod
   def setUpClass(cls):
-    if not check_env_flag('PT_XLA_DEBUG'):
+    pt_xla_debug_enabled = xu.getenv_as('PT_XLA_DEBUG', bool, False)
+    cls.debug_level = xu.getenv_as('PT_XLA_DEBUG_LEVEL', int, -1)
+    cls.debug_level = 100 if (cls.debug_level == -1 and
+                              pt_xla_debug_enabled) else cls.debug_level
+    if not check_env_flag('PT_XLA_DEBUG') and cls.debug_level == -1:
       assert False, "This test should be run with PT_XLA_DEBUG"
     cls.debug_file_name = os.getenv('PT_XLA_DEBUG_FILE')
     if not cls.debug_file_name:
@@ -33,19 +38,33 @@ def test_user_mark_step(self):
       executation_causes = extract_execution_cause(lines)
       compilation_causes = extract_compilation_cause(lines)
       graph_infos = extract_graph_infos(lines)
+      post_compilation_infos = extract_post_compilation_analysis(lines)
 
-    self.assertEqual(len(executation_causes), 1)
-    self.assertIn('user mark_step', executation_causes[0])
+    self.assertEqual(len(post_compilation_infos), 1)
+    self.assertIn('GB', post_compilation_infos[0].input_size)
+    self.assertIn('GB', post_compilation_infos[0].output_size)
+    self.assertIn('GB', post_compilation_infos[0].aliased_size)
+    self.assertIn('GB', post_compilation_infos[0].intermediate_size)
+    self.assertIn('GB', post_compilation_infos[0].program_size)
+
+    if self.debug_level > 1:
+      self.assertEqual(len(executation_causes), 1)
+      self.assertIn('user mark_step', executation_causes[0])
+    else:
+      self.assertEqual(len(executation_causes), 0)
 
     self.assertEqual(len(compilation_causes), 1)
     self.assertIn('user mark_step', compilation_causes[0])
 
-    self.assertEqual(len(graph_infos), 2)
-    # one graph info from compilation, one from execution, hash should match
-    self.assertEqual(graph_infos[0].hash, graph_infos[1].hash)
+    if self.debug_level > 1:
+      self.assertEqual(len(graph_infos), 2)
+      # one graph info from compilation, one from execution, hash should match
+      self.assertEqual(graph_infos[0].hash, graph_infos[1].hash)
+    else:
+      self.assertEqual(len(graph_infos), 1)
     # this graph has one input(random seed) and one output(t1)
-    self.assertEqual(graph_infos[1].num_input, 1)
-    self.assertEqual(graph_infos[1].num_output, 1)
+    self.assertEqual(graph_infos[0].num_input, 1)
+    self.assertEqual(graph_infos[0].num_output, 1)
     open(self.debug_file_name, 'w').close()
 
   def test_step_trace(self):
@@ -58,20 +77,26 @@ def test_step_trace(self):
       compilation_causes = extract_compilation_cause(lines)
       graph_infos = extract_graph_infos(lines)
 
-    self.assertEqual(len(causes), 1)
-    self.assertIn('mark_step when exiting a profiler StepTrace region',
-                  causes[0])
+    if self.debug_level > 1:
+      self.assertEqual(len(causes), 1)
+      self.assertIn('mark_step when exiting a profiler StepTrace region',
+                    causes[0])
+    else:
+      self.assertEqual(len(causes), 0)
 
     self.assertEqual(len(compilation_causes), 1)
     self.assertIn('mark_step when exiting a profiler StepTrace region',
                   compilation_causes[0])
 
-    self.assertEqual(len(graph_infos), 2)
-    # one graph info from compilation, one from execution, hash should match
-    self.assertEqual(graph_infos[0].hash, graph_infos[1].hash)
+    if self.debug_level > 1:
+      self.assertEqual(len(graph_infos), 2)
+      # one graph info from compilation, one from execution, hash should match
+      self.assertEqual(graph_infos[0].hash, graph_infos[1].hash)
+    else:
+      self.assertEqual(len(graph_infos), 1)
     # this graph has one input(random seed) and one output(t1)
-    self.assertEqual(graph_infos[1].num_input, 1)
-    self.assertEqual(graph_infos[1].num_output, 1)
+    self.assertEqual(graph_infos[0].num_input, 1)
+    self.assertEqual(graph_infos[0].num_output, 1)
     open(self.debug_file_name, 'w').close()
 
   def test_dynamo(self):
@@ -89,11 +114,14 @@ def toy_program(t1):
       compilation_causes = extract_compilation_cause(lines)
       graph_infos = extract_graph_infos(lines)
 
-    self.assertEqual(len(executation_causes), 2)
-    self.assertIn('mark_step when dynamo processing input graphs',
-                  executation_causes[0])
-    self.assertIn('dynamo is executing a compiled program',
-                  executation_causes[1])
+    if self.debug_level > 1:
+      self.assertEqual(len(executation_causes), 2)
+      self.assertIn('mark_step when dynamo processing input graphs',
+                    executation_causes[0])
+      self.assertIn('dynamo is executing a compiled program',
+                    executation_causes[1])
+    else:
+      self.assertEqual(len(executation_causes), 0)
 
     self.assertEqual(len(compilation_causes), 2)
     self.assertIn('mark_step when dynamo processing input graphs',
@@ -101,17 +129,24 @@ def toy_program(t1):
     self.assertIn('dynamo is compiling a FX graph to HLO',
                   compilation_causes[1])
 
-    # one graph info from compilation, one from execution, hash should match
-    self.assertEqual(graph_infos[0].hash, graph_infos[1].hash)
+    if self.debug_level > 1:
+      # one graph info from compilation, one from execution, hash should match
+      self.assertEqual(graph_infos[0].hash, graph_infos[1].hash)
     # this graph has one input(random seed) and one output(t1)
-    self.assertEqual(graph_infos[1].num_input, 1)
-    self.assertEqual(graph_infos[1].num_output, 1)
-
-    # one graph info from dynamo compilation, one from dynamo execution, hash should match
-    self.assertEqual(graph_infos[2].hash, graph_infos[3].hash)
-    # this graph has two input(t1, 100) and one output
-    self.assertEqual(graph_infos[3].num_input, 2)
-    self.assertEqual(graph_infos[3].num_output, 1)
+    self.assertEqual(graph_infos[0].num_input, 1)
+    self.assertEqual(graph_infos[0].num_output, 1)
+
+    if self.debug_level > 1:
+      # one graph info from dynamo compilation, one from dynamo execution, hash should match
+      self.assertEqual(graph_infos[2].hash, graph_infos[3].hash)
+      # this graph has two input(t1, 100) and one output
+      self.assertEqual(graph_infos[3].num_input, 2)
+      self.assertEqual(graph_infos[3].num_output, 1)
+    else:
+      # this graph has two input(t1, 100) and one output
+      self.assertEqual(graph_infos[1].num_input, 2)
+      self.assertEqual(graph_infos[1].num_output, 1)
+
     open(self.debug_file_name, 'w').close()
 
   def test_parallel_loader(self):
@@ -140,22 +175,26 @@ def test_parallel_loader(self):
       compilation_causes = extract_compilation_cause(lines)
       graph_infos = extract_graph_infos(lines)
 
-    self.assertEqual(len(executation_causes), batch_size)
-    for cause in executation_causes:
-      self.assertIn('mark_step in parallel loader at step end', cause)
+    if self.debug_level > 1:
+      self.assertEqual(len(executation_causes), batch_size)
+      for cause in executation_causes:
+        self.assertIn('mark_step in parallel loader at step end', cause)
+    else:
+      self.assertEqual(len(executation_causes), 0)
 
     # We should only compile once.
     self.assertEqual(len(compilation_causes), 1)
     self.assertIn('mark_step in parallel loader at step end',
                   compilation_causes[0])
 
-    self.assertEqual(len(graph_infos), batch_size + 1)
-    # one graph info from compilation, batch size from execution, hash should match
-    for i in range(batch_size + 1):
-      self.assertEqual(graph_infos[0].hash, graph_infos[i].hash)
-      # this graph has two input(data, 100) and one output(dummy)
-      self.assertEqual(graph_infos[i].num_input, 2)
-      self.assertEqual(graph_infos[i].num_output, 1)
+    if self.debug_level > 1:
+      self.assertEqual(len(graph_infos), batch_size + 1)
+      # one graph info from compilation, batch size from execution, hash should match
+      for i in range(batch_size + 1):
+        self.assertEqual(graph_infos[0].hash, graph_infos[i].hash)
+        # this graph has two input(data, 100) and one output(dummy)
+        self.assertEqual(graph_infos[i].num_input, 2)
+        self.assertEqual(graph_infos[i].num_output, 1)
     open(self.debug_file_name, 'w').close()
 
   def test_print(self):
@@ -168,19 +207,22 @@ def test_print(self):
       compilation_causes = extract_compilation_cause(lines)
       graph_infos = extract_graph_infos(lines)
 
-    self.assertEqual(len(executation_causes), 1)
-    self.assertIn('user code trying to access tensor value',
-                  executation_causes[0])
+    if self.debug_level > 1:
+      self.assertEqual(len(executation_causes), 1)
+      self.assertIn('user code trying to access tensor value',
+                    executation_causes[0])
+      # one graph info from compilation, one from execution, hash should match
+      self.assertEqual(graph_infos[0].hash, graph_infos[1].hash)
+    else:
+      self.assertEqual(len(executation_causes), 0)
 
     self.assertEqual(len(compilation_causes), 1)
     self.assertIn('user code trying to access tensor value',
                   compilation_causes[0])
 
-    # one graph info from compilation, one from execution, hash should match
-    self.assertEqual(graph_infos[0].hash, graph_infos[1].hash)
     # this graph has one input(random seed) and one output(t1)
-    self.assertEqual(graph_infos[1].num_input, 1)
-    self.assertEqual(graph_infos[1].num_output, 1)
+    self.assertEqual(graph_infos[0].num_input, 1)
+    self.assertEqual(graph_infos[0].num_output, 1)
     open(self.debug_file_name, 'w').close()
 
   def test_frame(self):
@@ -191,15 +233,20 @@ def test_frame(self):
       lines = f.readlines()
       frames = extract_python_frames(lines)
 
-    # one for compilation, one for execution
-    self.assertEqual(len(frames), 2)
+    # one for compilation, one for post-compilation analysis, one for execution
+    if self.debug_level > 1:
+      self.assertEqual(len(frames), 3)
+    else:
+      self.assertEqual(len(frames), 2)
     max_frame = os.getenv('PT_XLA_DEBUG_MAX_FRAME', 8)
     # Additonal lines are
     # 1. Python Frame Triggered Execution:
     # 2. ....
     # 3. empty line
     self.assertEqual(len(frames[0].split('\n')), max_frame + 3)
-    self.assertEqual(len(frames[1].split('\n')), max_frame + 3)
+    # second frame will be empty from the post-compilation-analysis
+    if self.debug_level > 1:
+      self.assertEqual(len(frames[2].split('\n')), max_frame + 3)
     # Check mark_step is the first frame
     self.assertIn('mark_step', frames[0].split('\n')[1])
 
diff --git a/test/dynamo/test_dynamo.py b/test/dynamo/test_dynamo.py
index e7ac2681d5a..3a3eb3d43f1 100644
--- a/test/dynamo/test_dynamo.py
+++ b/test/dynamo/test_dynamo.py
@@ -152,7 +152,7 @@ def test_simple_model(self):
 
   # Tests that the dynamo bridge automatically moves tensors to XLA device,
   # then back to the original device.
-  @unittest.skipIf(xr.device_type() != "CUDA",
+  @unittest.skipIf(xr.device_type() != "CUDA" or not torch.cuda.is_available(),
                    f"GPU tests should only run on GPU devices.")
   def test_simple_model_automoves_tensors(self):
     x = torch.tensor(100.0).to(device="cuda")
@@ -489,13 +489,13 @@ def test_resnet18(self):
     # Graph 1: forward
     # Graph 2: backward
     # Graph 3: sync input for backward
-    self.assertEqual(met.metric_data('CompileTime')[0], 3)
+    self.assertLessEqual(met.metric_data('CompileTime')[0], 3)
     # We execute 3 graphs per step.
-    self.assertEqual(met.metric_data('ExecuteTime')[0], sample_count * 3)
+    self.assertLessEqual(met.metric_data('ExecuteTime')[0], sample_count * 3)
     # one for each forward and one for each backward
-    self.assertEqual(
+    self.assertLessEqual(
         met.metric_data('RunCachedGraphInputData')[0], sample_count * 2)
-    self.assertEqual(
+    self.assertLessEqual(
         met.metric_data('RunCachedGraphOutputData')[0], sample_count * 2)
 
 
@@ -641,10 +641,7 @@ def test_all_cpu_tensor(self):
       # there should be 18 paramters + 1 input
       self.assertGreater(len(w), 15)
       self.assertIn('Found tensor with shape torch.Size', str(w[0].message))
-    # no XLA operation should happens except a empty mark_step. Partitioner should offload all CPU
-    # ops to CPU.
-    self.assertEqual(len(met.counter_names()), 1)
-    self.assertIn('MarkStep', met.counter_names())
+    self.assertLessEqual(len(met.counter_names()), 1)
 
 
 class DynamoOperationsTests(test_utils.XlaTestCase):
diff --git a/test/pjrt/test_runtime_tpu.py b/test/pjrt/test_runtime_tpu.py
index 0def33ae275..e460864291b 100644
--- a/test/pjrt/test_runtime_tpu.py
+++ b/test/pjrt/test_runtime_tpu.py
@@ -7,6 +7,7 @@
 
 import torch
 from absl.testing import absltest, parameterized
+import torch_xla
 import torch_xla.core.xla_env_vars as xenv
 import torch_xla.core.xla_model as xm
 import torch_xla.debug.metrics as met
@@ -206,7 +207,8 @@ def _runtime_device_attributes():
   def test_runtime_device_attributes(self):
     result = pjrt.run_multiprocess(self._runtime_device_attributes)
     for device in result.values():
-      self.assertCountEqual(['coords', 'core_on_chip'], list(device.keys()))
+      self.assertCountEqual(['coords', 'core_on_chip', 'num_cores'],
+                            list(device.keys()))
       self.assertIsInstance(device['coords'], list)
       self.assertIsInstance(device['core_on_chip'], int)
 
@@ -218,7 +220,7 @@ def test_global_runtime_device_attributes(self):
     results = pjrt.run_multiprocess(self._global_runtime_device_attributes)
     for result in results.values():
       for device in result:
-        self.assertCountEqual(['coords', 'core_on_chip', 'name'],
+        self.assertCountEqual(['coords', 'core_on_chip', 'name', 'num_cores'],
                               list(device.keys()))
         self.assertIsInstance(device['coords'], list)
         self.assertIsInstance(device['core_on_chip'], int)
@@ -251,6 +253,16 @@ def test_execute_time_metric(self):
           f"Expected exectue time of {i} to take more than "
           f"{expected_time_seconds} seconds, got {v / 1e9} seconds")
 
+  @staticmethod
+  def _memory_usage():
+    return xm.get_memory_info(torch_xla.device())
+
+  def test_memory_usage(self):
+    results = pjrt.run_multiprocess(self._memory_usage)
+    for usage in results.values():
+      self.assertIn('bytes_used', usage)
+      self.assertIn('bytes_limit', usage)
+
 
 if __name__ == '__main__':
   absltest.main()
diff --git a/test/pytorch_test_base.py b/test/pytorch_test_base.py
index 88ad0f6bc3d..3a6dcdd96c6 100644
--- a/test/pytorch_test_base.py
+++ b/test/pytorch_test_base.py
@@ -70,6 +70,7 @@
         'test_pdist_norm_backward_xla',  # pdist_single
         'test_pdist_norm_forward_xla',  # pdist_single
         'test_nuclear_norm_axes_small_brute_force',
+        'test_nondeterministic_alert_EmbeddingBag_max_xla',  # FIXME: implement embedding_bag_backward
         'test_mul_intertype_scalar',
         'test_masked_select_discontiguous',  # FIXME: wrong result
         'test_memory_format_type',
diff --git a/test/run_tests.sh b/test/run_tests.sh
index 4d4bd530e27..409c36c9c27 100755
--- a/test/run_tests.sh
+++ b/test/run_tests.sh
@@ -8,7 +8,7 @@ VERBOSITY=2
 
 # Note [Keep Going]
 #
-# Set the `CONTINUE_ON_ERROR` flag to `true` to make the CircleCI tests continue on error.
+# Set the `CONTINUE_ON_ERROR` flag to `true` to make the CI tests continue on error.
 # This will allow you to see all the failures on your PR, not stopping with the first
 # test failure like the default behavior.
 CONTINUE_ON_ERROR="${CONTINUE_ON_ERROR:-0}"
@@ -104,7 +104,7 @@ function run_xla_hlo_debug {
 
 function run_dynamic {
   echo "Running in DynamicShape mode: $@"
-  XLA_EXPERIMENTAL="nonzero:masked_select:masked_scatter" run_test "$@"
+  XLA_EXPERIMENTAL="nonzero:masked_select:masked_scatter:nms" run_test "$@"
 }
 
 function run_eager_debug {
@@ -127,6 +127,11 @@ function run_pt_xla_debug {
   PT_XLA_DEBUG=1 PT_XLA_DEBUG_FILE="/tmp/pt_xla_debug.txt" run_test "$@"
 }
 
+function run_pt_xla_debug_level1 {
+  echo "Running in save tensor file mode: $@"
+  PT_XLA_DEBUG_LEVEL=1 PT_XLA_DEBUG_FILE="/tmp/pt_xla_debug.txt" run_test "$@"
+}
+
 function run_torchrun {
   if [ -x "$(command -v nvidia-smi)" ] && [ "$XLA_CUDA" != "0" ]; then
     echo "Running torchrun test for GPU $@"
@@ -162,10 +167,10 @@ function run_xla_op_tests1 {
   run_dynamic "$CDIR/ds/test_dynamic_shapes.py"
   run_dynamic "$CDIR/ds/test_dynamic_shape_models.py" "$@" --verbosity=$VERBOSITY
   run_eager_debug  "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
-  run_test "$CDIR/test_grad_checkpoint.py"
   run_test "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
   run_test_without_functionalization "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
   run_pt_xla_debug "$CDIR/debug_tool/test_pt_xla_debug.py"
+  run_pt_xla_debug_level1 "$CDIR/debug_tool/test_pt_xla_debug.py"
   run_test "$CDIR/test_async_closures.py"
   run_test "$CDIR/test_hlo_metadata.py"
   run_test "$CDIR/test_profiler.py"
@@ -210,8 +215,9 @@ function run_xla_op_tests3 {
   run_test "$CDIR/stablehlo/test_exports.py"
   run_test "$CDIR/stablehlo/test_export_fx_passes.py"
   run_test "$CDIR/stablehlo/test_implicit_broadcasting.py"
-  run_test "$CDIR/stablehlo/test_mark_pattern.py"
+  run_test "$CDIR/stablehlo/test_composite.py"
   run_test "$CDIR/stablehlo/test_pt2e_qdq.py"
+  run_test "$CDIR/stablehlo/test_stablehlo_custom_call.py"
   run_xla_hlo_debug "$CDIR/stablehlo/test_stablehlo_inference.py"
   run_test "$CDIR/stablehlo/test_stablehlo_compile.py"
   run_test "$CDIR/stablehlo/test_unbounded_dynamism.py"
diff --git a/test/spmd/test_dynamo_spmd.py b/test/spmd/test_dynamo_spmd.py
index 0595f502da0..d1f6cdc3dce 100644
--- a/test/spmd/test_dynamo_spmd.py
+++ b/test/spmd/test_dynamo_spmd.py
@@ -205,6 +205,8 @@ def test_dynamo_spmd_mark_sharding_outside_of_compile(self):
     dynamo_res = dynamo_linear(xla_x)
     self.assertEqual(met.metric_data('CompileTime')[0], compile_count)
 
+  # https://github.com/pytorch/xla/pull/6921#issuecomment-2062106737
+  @unittest.skip("Failing in CI")
   def test_mark_sharding_inside_compile(self):
     met.clear_counters()
     device = xm.xla_device()
diff --git a/test/spmd/test_fsdp_v2.py b/test/spmd/test_fsdp_v2.py
index ae997892547..cc161d0f1a3 100644
--- a/test/spmd/test_fsdp_v2.py
+++ b/test/spmd/test_fsdp_v2.py
@@ -26,13 +26,13 @@ def setUpClass(cls):
   def test_fsdp_v2_basic(self):
     model = self.SimpleLinear().to(xm.xla_device())
     mesh = self._get_mesh((self.n_devices, 1), None, ('fsdp', 'tensor'))
-    model.fc1 = FSDPv2(model.fc1, mesh)
-    model.fc2 = FSDPv2(model.fc2, mesh)
-    model = FSDPv2(model, mesh)
+    model.fc1 = FSDPv2(model.fc1, mesh=mesh)
+    model.fc2 = FSDPv2(model.fc2, mesh=mesh)
+    model = FSDPv2(model, mesh=mesh)
 
     # Make sure all weights are sharded.
     if self.n_devices > 1:
-      annotation = '{devices=[%d,1]%s}' % (self.n_devices, ','.join(
+      annotation = '{devices=[1,%d]%s}' % (self.n_devices, ','.join(
           [str(i) for i in range(self.n_devices)]))
       self.assertEqual(annotation,
                        torch_xla._XLAC._get_xla_sharding_spec(model.fc1.weight))
@@ -67,9 +67,9 @@ def test_fsdp_v2_output_correctness(self):
 
     model = copy.deepcopy(model_expected)
     mesh = self._get_mesh((self.n_devices, 1), None, ('fsdp', 'tensor'))
-    model.fc1 = FSDPv2(model.fc1, mesh)
-    model.fc2 = FSDPv2(model.fc2, mesh)
-    model = FSDPv2(model, mesh)
+    model.fc1 = FSDPv2(model.fc1, mesh=mesh)
+    model.fc2 = FSDPv2(model.fc2, mesh=mesh)
+    model = FSDPv2(model, mesh=mesh)
 
     x_expected = torch.randn(16, 128).to(xm.xla_device())
 
@@ -87,7 +87,7 @@ def test_fsdp_v2_auto_wrap_basic(self):
         transformer_auto_wrap_policy,
         transformer_layer_cls={torch.nn.Linear},
     )
-    model = FSDPv2(model, mesh, auto_wrap_policy=auto_wrap_policy)
+    model = FSDPv2(model, mesh=mesh, auto_wrap_policy=auto_wrap_policy)
 
     self.assertTrue(isinstance(model.fc1, FSDPv2))
     self.assertTrue(isinstance(model.fc2, FSDPv2))
@@ -106,7 +106,7 @@ def auto_wrapper_callable(m, *args, **kwargs):
 
     model = FSDPv2(
         model,
-        mesh,
+        mesh=mesh,
         auto_wrap_policy=auto_wrap_policy,
         auto_wrapper_callable=auto_wrapper_callable)
 
@@ -139,6 +139,64 @@ def test_fsdp_v2_cpu_model(self):
     self.assertEqual(
         str(list(model._orig_module.parameters())[0].device), "xla:0")
 
+  @unittest.skipIf(xr.device_type() != 'TPU', "This test only works on TPU.")
+  def test_fsdp_v2_multi_slice(self):
+    model = self.SimpleLinear().to(xm.xla_device())
+    mesh = self._get_mesh((2, self.n_devices // 2, 1), None,
+                          ('data', 'fsdp', 'tensor'))
+    model = FSDPv2(model, mesh=mesh, extra_data_axis="data")
+
+    # Make sure all weights are sharded.
+    annotation = '{devices=[1,2,2]0,2,1,3 last_tile_dim_replicate}'
+    if self.n_devices == 8:
+      annotation = '{devices=[1,4,2]0,4,1,5,2,6,3,7 last_tile_dim_replicate}'
+    self.assertEqual(annotation,
+                     torch_xla._XLAC._get_xla_sharding_spec(model.fc1.weight))
+    self.assertEqual(annotation,
+                     torch_xla._XLAC._get_xla_sharding_spec(model.fc2.weight))
+
+    x = torch.randn(16, 128).to(xm.xla_device())
+    xs.mark_sharding(x, mesh, (('data', 'fsdp'), None))
+    output = model(x)
+    # Make sure output are sharded.
+    annotation = '{devices=[4,1]0,1,2,3}'
+    if self.n_devices == 8:
+      annotation = '{devices=[8,1]0,1,2,3,4,5,6,7}'
+    self.assertEqual(annotation, torch_xla._XLAC._get_xla_sharding_spec(x))
+    self.assertEqual(annotation, torch_xla._XLAC._get_xla_sharding_spec(output))
+
+    # Make sure the model can execute without error.
+    xm.mark_step()
+    xm.wait_device_ops()
+
+  @unittest.skipIf(xr.device_type() != 'TPU', "This test only works on TPU.")
+  def test_fsdp_v2_multi_slice_output_correctness(self):
+    model_expected = self.SimpleLinear().to(xm.xla_device())
+
+    model = copy.deepcopy(model_expected)
+    mesh = self._get_mesh((2, self.n_devices // 2, 1), None,
+                          ('data', 'fsdp', 'tensor'))
+    model = FSDPv2(model, mesh=mesh, extra_data_axis="data")
+
+    x_expected = torch.randn(16, 128).to(xm.xla_device())
+
+    x = copy.deepcopy(x_expected)
+    xs.mark_sharding(x, mesh, (('data', 'fsdp'), None))
+
+    output_expected = model_expected(x_expected)
+    output = model(x)
+    self.assertTrue(torch.allclose(output_expected.cpu(), output.cpu()))
+
+  def test_fsdp_v2_multi_slice_error(self):
+    model = self.SimpleLinear().to(xm.xla_device())
+    xs.set_global_mesh(
+        self._get_mesh((2, self.n_devices // 2, 1), None,
+                       ('data', 'fsdp', 'tensor')))
+
+    with self.assertRaisesRegex(ValueError,
+                                "The provided ddp axis is not in the mesh."):
+      model = FSDPv2(model, extra_data_axis='ddp')
+
 
 if __name__ == '__main__':
   test = unittest.main()
diff --git a/test/spmd/test_xla_distributed_checkpoint.py b/test/spmd/test_xla_distributed_checkpoint.py
index a78057210ab..a035a3f11bd 100644
--- a/test/spmd/test_xla_distributed_checkpoint.py
+++ b/test/spmd/test_xla_distributed_checkpoint.py
@@ -17,11 +17,13 @@
 import torch_xla.distributed.spmd as xs
 
 from torch.distributed.checkpoint._fsspec_filesystem import *
+from collections.abc import Iterable
+
 from torch.distributed.checkpoint.default_planner import (
     create_default_local_save_plan,
     create_default_global_save_plan,
 )
-from torch_xla.experimental.distributed_checkpoint import SPMDLoadPlanner, SPMDSavePlanner, CheckpointManager
+from torch_xla.experimental.distributed_checkpoint import SPMDLoadPlanner, SPMDSavePlanner, CheckpointManager, prime_optimizer
 from torch_xla.experimental.distributed_checkpoint._helpers import (
     _sharded_cpu_state_dict, _CpuShards, _is_sharded_tensor)
 
@@ -68,6 +70,33 @@ def _same_shard_data(self, shards, others) -> bool:
         return False
     return True
 
+  def _assert_same_state_dict(self, sd1, sd2, keypath=""):
+    assert type(sd1) == type(
+        sd2), f"Different types in state_dict: {sd1} vs {sd2}"
+
+    if isinstance(sd1, torch.Tensor):
+      assert sd1.device == sd2.device, f"Tensors on different devices at {keypath}: {sd1} vs {sd2}"
+      if sd1.device == xm.xla_device():
+        sharding1 = torch_xla._XLAC._get_xla_sharding_spec(sd1)
+        sharding2 = torch_xla._XLAC._get_xla_sharding_spec(sd2)
+        assert sharding1 == sharding2, f"Different sharding on tensors at {keypath}: {sharding1} vs {sharding2}"
+      assert torch.equal(
+          sd1, sd2), f"Different tensors at {keypath}:\n{sd1} vs {sd2}"
+
+    elif isinstance(sd1, dict):
+      assert sd1.keys() == sd2.keys(
+      ), f"Different keys at {keypath}: {sd1} vs {sd2}"
+      for key in sd1:
+        self._assert_same_state_dict(
+            sd1[key], sd2[key], keypath=f'{keypath}.{key}')
+
+    elif isinstance(sd1, Iterable):
+      for ind, (a, b) in enumerate(zip(sd1, sd2)):
+        self._assert_same_state_dict(a, b, keypath=f'{keypath}[{ind}]')
+
+    else:
+      assert sd1 == sd2, f"Different value at {keypath}: {sd1} vs {sd2}"
+
 
 class EndToEndCheckpointTest(DistributedCheckpointTestBase):
 
@@ -357,7 +386,7 @@ class CheckpointManagerTest(DistributedCheckpointTestBase):
 
   def setUp(self):
     super().setUp()
-    # Initialize the a minimal process group
+    # Initialize a minimal process group
     dist.init_process_group(
         backend='gloo',
         init_method='tcp://localhost:8932',
@@ -565,6 +594,68 @@ def test_auto_checkpoint(self, tmpdir):
       self.assertTrue(chkpt_mgr.reached_preemption(step))
 
 
+@unittest.skipIf(xr.device_type() != 'TPU',
+                 'TPU required for worker IP discovery')
+class OptimizerCheckpointTest(DistributedCheckpointTestBase):
+
+  def setUp(self):
+    super().setUp()
+    # Initialize a minimal process group
+    dist.init_process_group(
+        backend='gloo',
+        init_method='tcp://localhost:8932',
+        world_size=1,
+        rank=0)
+
+  def tearDown(self):
+    super().tearDown()
+    # Destroy the CPU process group after the test
+    dist.destroy_process_group()
+
+  def _get_model_and_optimizer(self, optim_cls):
+    model = self._get_sharded_model()
+    optim = optim_cls(params=model.parameters())
+    return model, optim
+
+  def _run_train_step(self, model, optim):
+    torch.manual_seed(42)
+    model(torch.ones(10, 128).to('xla')).square().sum().backward()
+    optim.step()
+    xm.mark_step()
+
+  def _test_optimizer(self, tmpdir, optim_cls):
+    model, optim = self._get_model_and_optimizer(optim_cls)
+    self._run_train_step(model, optim)
+
+    # Take a checkpoint including the optimizer
+    chkpt_mgr = CheckpointManager(tmpdir, 1)
+    state_dict = {'model': model.state_dict(), 'optim': optim.state_dict()}
+    chkpt_mgr.save(0, state_dict, force=True)
+
+    # Load the checkpoint into a new model and optimizer
+    new_model, new_optim = self._get_model_and_optimizer(optim_cls)
+    prime_optimizer(new_optim)
+    new_state_dict = {
+        'model': new_model.state_dict(),
+        'optim': new_optim.state_dict()
+    }
+    chkpt_mgr.restore(0, new_state_dict)
+    self._assert_same_state_dict(state_dict, new_state_dict)
+
+    new_model.load_state_dict(new_state_dict['model'])
+    new_optim.load_state_dict(new_state_dict['optim'])
+    self._assert_same_state_dict(new_model.state_dict(), model.state_dict())
+    self._assert_same_state_dict(new_optim.state_dict(), optim.state_dict())
+
+  @run_with_tmpdir
+  def test_sgd(self, tmpdir):
+    self._test_optimizer(tmpdir, torch.optim.SGD)
+
+  @run_with_tmpdir
+  def test_adamw(self, tmpdir):
+    self._test_optimizer(tmpdir, torch.optim.AdamW)
+
+
 if __name__ == '__main__':
   test = unittest.main()
   sys.exit(0 if test.result.wasSuccessful() else 1)
diff --git a/test/stablehlo/test_mark_pattern.py b/test/stablehlo/test_composite.py
similarity index 100%
rename from test/stablehlo/test_mark_pattern.py
rename to test/stablehlo/test_composite.py
diff --git a/test/stablehlo/test_export_fx_passes.py b/test/stablehlo/test_export_fx_passes.py
index d1e731abd6e..82650997316 100644
--- a/test/stablehlo/test_export_fx_passes.py
+++ b/test/stablehlo/test_export_fx_passes.py
@@ -18,7 +18,7 @@ class ExportFxPassTest(unittest.TestCase):
 
   def test_decompose_dynamic_shape_select(self):
     args = (torch.rand((10, 197, 768)), 1, 0)
-    dynamic_shapes = ([{0: Dim("bs")}, None, None],)
+    dynamic_shapes = (({0: Dim("bs")}, None, None),)
     m = wrap_func_as_nn_module(torch.ops.aten.select.int)
     ep = export(m, args, dynamic_shapes=dynamic_shapes)
     out1 = ep.module()(*args)
@@ -55,7 +55,7 @@ def forward(self, x):
   def test_embedding_indices_flatten(self):
     args = (torch.rand((20, 768)), torch.randint(0, 15,
                                                  (3, 10)).to(torch.int64))
-    dynamic_shapes = ([None, {0: Dim("bs")}],)
+    dynamic_shapes = ((None, {0: Dim("bs")}),)
     m = wrap_func_as_nn_module(torch.ops.aten.embedding.default)
     ep = export(m, args, dynamic_shapes=dynamic_shapes)
     print(ep)
diff --git a/test/stablehlo/test_exports.py b/test/stablehlo/test_exports.py
index a08b65d1ffe..6208ae1ca52 100644
--- a/test/stablehlo/test_exports.py
+++ b/test/stablehlo/test_exports.py
@@ -45,7 +45,7 @@ def test_interpolate(self):
       exported = torch.export.export(model, arg)
       shlo = exported_program_to_stablehlo(exported)
       ans2 = shlo(*arg).cpu().to(torch.float32)
-      self.assertTrue(torch.allclose(ans, ans2, atol=1e-5))
+      torch.testing.assert_close(ans, ans2, rtol=1e-5, atol=1e-4)
 
   def test_constant(self):
 
diff --git a/test/stablehlo/test_mlir_debuginfo.py b/test/stablehlo/test_mlir_debuginfo.py
index 42ebfaefbdb..fc81a43c20a 100644
--- a/test/stablehlo/test_mlir_debuginfo.py
+++ b/test/stablehlo/test_mlir_debuginfo.py
@@ -1,10 +1,11 @@
-import unittest
 import re
+import unittest
 
 import torch
 import torch_xla
 import torch_xla.experimental.xla_mlir_debuginfo
-from torch_xla.stablehlo import exported_program_to_stablehlo
+from torch_xla.stablehlo import (StableHLOExportOptions,
+                                 exported_program_to_stablehlo)
 
 
 class XlaMlirDebuginfoTest(unittest.TestCase):
@@ -28,6 +29,31 @@ def forward(self, x, y):
     self.assertTrue(re.search(r'stablehlo.add.+\"MY_ADD\"', mlir_text))
     self.assertTrue(re.search(r'stablehlo.sub.+\"MY_SUB\"', mlir_text))
 
+  def test_export_node_metadata(self):
+
+    class M(torch.nn.Module):
+
+      def __init__(self):
+        super().__init__()
+        self.fc1 = torch.nn.Linear(in_features=4, out_features=16, bias=True)
+        self.fc2 = torch.nn.Linear(in_features=16, out_features=10, bias=True)
+
+      def forward(self, x):
+        x = self.fc1(x)
+        x = self.fc2(x)
+        return torch.relu(x)
+
+    args = (torch.rand(2, 4),)
+    ep = torch.export.export(M(), args)
+    export_options = StableHLOExportOptions()
+    export_options.export_node_metadata = True
+    shlo = exported_program_to_stablehlo(ep, options=export_options)
+    shlo_text = shlo.get_stablehlo_text()
+    print(shlo_text)
+    self.assertTrue('stack_trace' in shlo_text)
+    self.assertTrue('nn_module_stack' in shlo_text)
+    self.assertTrue('source_fn_stack' in shlo_text)
+
 
 if __name__ == '__main__':
   test = unittest.main()
diff --git a/test/stablehlo/test_pt2e_qdq.py b/test/stablehlo/test_pt2e_qdq.py
index da3341957ee..ea3f6cac067 100644
--- a/test/stablehlo/test_pt2e_qdq.py
+++ b/test/stablehlo/test_pt2e_qdq.py
@@ -126,8 +126,10 @@ def test_resnet18_per_channel(self):
     quantizer = XNNPACKQuantizer().set_global(
         get_symmetric_quantization_config(is_per_channel=True))
     m = prepare_pt2e(m, quantizer)
-
-    # Step 3: Quantize the model
+    # Step 3: Run through example inputs, otherwise per-channel
+    # quant may have scalar scale/zero_point
+    m(*args)
+    # Step 4: Quantize the model
     m = convert_pt2e(m, fold_quantize=False)
 
     # Trace with torch/xla and export stablehlo
diff --git a/test/stablehlo/test_stablehlo_custom_call.py b/test/stablehlo/test_stablehlo_custom_call.py
new file mode 100644
index 00000000000..7291608e506
--- /dev/null
+++ b/test/stablehlo/test_stablehlo_custom_call.py
@@ -0,0 +1,121 @@
+import re
+import unittest
+
+import torch
+import torch_xla
+import torch_xla.core.xla_model as xm
+import torch_xla.experimental.stablehlo_custom_call
+from torch.library import Library, impl, impl_abstract
+from torch_xla.experimental.stablehlo_custom_call import stablehlo_custom_call
+from torch_xla.stablehlo import (StableHLOExportOptions,
+                                 exported_program_to_stablehlo)
+
+m = Library("my_custom_library", "DEF")
+
+
+class StableHLOCustomCallExportTest(unittest.TestCase):
+
+  def test_single_output(self):
+
+    m.define("custom_op(Tensor input) -> Tensor")
+
+    @impl(m, "custom_op", "Meta")
+    def custom_op_meta(x):
+      return torch.empty_like(x)
+
+    class M(torch.nn.Module):
+
+      def forward(self, x):
+        x = torch.sin(x)
+        x = torch.ops.my_custom_library.custom_op(x)
+        x = torch.cos(x)
+        x = torch.ops.my_custom_library.custom_op(x)
+        x = torch.sin(x)
+        return x
+
+    options = StableHLOExportOptions()
+    options.custom_ops_allowed_in_graph.add("my_custom_library")
+    ep = torch.export.export(M(), (torch.randn(3, 3),))
+    shlo_module = exported_program_to_stablehlo(ep, options)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r"stablehlo.custom_call.*@my_custom_library\.custom_op\.default",
+            shlo_text) is not None)
+    self.assertTrue(
+        re.search(r"tensor<3x3xf32>.*->.*tensor<3x3xf32>", shlo_text)
+        is not None)
+    self.assertTrue(shlo_text.count("@my_custom_library.custom_op.default", 2))
+
+  def test_multiple_input_output(self):
+
+    m.define("custom_op2(Tensor input, Tensor input) -> (Tensor, Tensor)")
+
+    @impl(m, "custom_op2", "Meta")
+    def custom_op2_meta(x, y):
+      return torch.empty_like(x), torch.empty(y.shape[1:], device='meta')
+
+    class M(torch.nn.Module):
+
+      def forward(self, x, y):
+        x = torch.sin(x)
+        x, y = torch.ops.my_custom_library.custom_op2(x, y)
+        x = torch.cos(x)
+        x, y = torch.ops.my_custom_library.custom_op2(x, y)
+        y = torch.sin(y)
+        return x, y
+
+    options = StableHLOExportOptions()
+    options.custom_ops_allowed_in_graph.add("my_custom_library")
+    ep = torch.export.export(M(), (torch.randn(3, 3), torch.randn(5, 5)))
+    shlo_module = exported_program_to_stablehlo(ep, options)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(
+            r"stablehlo.custom_call.*@my_custom_library\.custom_op2\.default",
+            shlo_text) is not None)
+    self.assertTrue(
+        re.search(
+            r"tensor<3x3xf32>.*tensor<5x5xf32>.*->.*tuple<tensor<3x3xf32>, tensor<5xf32>>",
+            shlo_text) is not None)
+    self.assertTrue(shlo_text.count("@my_custom_library.custom_op2.default", 2))
+
+  def test_stable_custom_call_api(self):
+
+    m.define("custom_op3(Tensor input) -> Tensor")
+
+    @impl(m, "custom_op3", "Meta")
+    def custom_op3_meta(x):
+      return torch.empty(x.shape[1:], device='meta')
+
+    @impl(m, "custom_op3", "XLA")
+    def custom_op3_xla(x):
+      res = stablehlo_custom_call((x,), "custom_op3", [x.shape[1:]],
+                                  [torch.int8], True, "backend_config", 1)
+      return res
+
+    class M(torch.nn.Module):
+
+      def forward(self, x):
+        x = torch.sin(x)
+        x = torch.ops.my_custom_library.custom_op3(x)
+        x = torch.cos(x)
+        return x
+
+    ep = torch.export.export(M(), (torch.randn(3, 3),))
+    shlo_module = exported_program_to_stablehlo(ep)
+    shlo_text = shlo_module.get_stablehlo_text()
+    self.assertTrue(
+        re.search(r"stablehlo.custom_call.*@custom_op3", shlo_text) is not None)
+    self.assertTrue(
+        re.search(r"tensor<3x3xf32>.*->.*tensor<3xi8>", shlo_text) is not None)
+    self.assertTrue("backend_config = \"backend_config\"" in shlo_text)
+    self.assertTrue("has_side_effect = true" in shlo_text)
+    # TODO: api version lost during conversion, or not shown in txt format.
+    # self.assertTrue("api_version = 1" in shlo_text)
+
+
+if __name__ == "__main__":
+
+  test = unittest.main()
+  sys.exit(0 if test.result.wasSuccessful() else 1)
diff --git a/test/stablehlo/test_unbounded_dynamism.py b/test/stablehlo/test_unbounded_dynamism.py
index e185a47007e..3cd17a7fe34 100644
--- a/test/stablehlo/test_unbounded_dynamism.py
+++ b/test/stablehlo/test_unbounded_dynamism.py
@@ -27,7 +27,7 @@ class UnboundedDynamismExportTest(unittest.TestCase):
 
   def test_add(self):
     args = (torch.rand((10, 197, 768)), torch.rand((10, 197, 768)))
-    dynamic_shapes = ([{0: Dim("dim")}, {0: Dim("dim")}],)
+    dynamic_shapes = (({0: Dim("dim")}, {0: Dim("dim")}),)
     m = wrap_func_as_nn_module(torch.ops.aten.add.Tensor)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -45,7 +45,7 @@ def test_add(self):
 
   def test_add_scalar(self):
     args = (torch.rand((10, 197, 768)), 0.345)
-    dynamic_shapes = ([{0: Dim("dim")}, None],)
+    dynamic_shapes = (({0: Dim("dim")}, None),)
     m = wrap_func_as_nn_module(torch.ops.aten.add.Tensor)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -62,7 +62,7 @@ def test_add_scalar(self):
 
   def test_addmm(self):
     args = (torch.rand((5)), torch.rand((10, 5)), torch.rand((5, 5)))
-    dynamic_shapes = ([None, {0: Dim("dim")}, None],)
+    dynamic_shapes = ((None, {0: Dim("dim")}, None),)
     m = wrap_func_as_nn_module(torch.ops.aten.addmm.default)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -83,7 +83,7 @@ def test_bmm(self):
         torch.rand((24, 197, 64)),
         torch.rand((24, 64, 197)),
     )
-    dynamic_shapes = ([{0: Dim("dim")}, {0: Dim("dim")}],)
+    dynamic_shapes = (({0: Dim("dim")}, {0: Dim("dim")}),)
     m = wrap_func_as_nn_module(torch.ops.aten.bmm.default)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -104,7 +104,7 @@ def test_bmm_dynamic_out_dim(self):
         torch.rand((8, 128, 256)),
         torch.rand((8, 256, 3)),
     )
-    dynamic_shapes = ([None, {2: Dim("dim")}],)
+    dynamic_shapes = ((None, {2: Dim("dim")}),)
     m = wrap_func_as_nn_module(torch.ops.aten.bmm.default)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -125,7 +125,7 @@ def test_bmm_dynamic_reduction_dim(self):
         torch.rand((8, 128, 3)),
         torch.rand((8, 3, 256)),
     )
-    dynamic_shapes = ([{2: Dim("dim")}, {1: Dim("dim")}],)
+    dynamic_shapes = (({2: Dim("dim")}, {1: Dim("dim")}),)
     m = wrap_func_as_nn_module(torch.ops.aten.bmm.default)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -143,7 +143,7 @@ def test_bmm_dynamic_reduction_dim(self):
 
   def test_cat(self):
     args = (torch.rand((10, 1, 768)), torch.rand((10, 196, 768)))
-    dynamic_shapes = ([{0: Dim("dim")}, {0: Dim("dim")}],)
+    dynamic_shapes = (({0: Dim("dim")}, {0: Dim("dim")}),)
     m = wrap_func_as_nn_module(
         lambda x, y: torch.ops.aten.cat.default([x, y], 1))
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
@@ -166,7 +166,7 @@ def test_conv(self):
         torch.rand((5, 3, 16, 16)),
         torch.rand((5)),
     )
-    dynamic_shapes = ([{0: Dim("dim")}, None, None],)
+    dynamic_shapes = (({0: Dim("dim")}, None, None),)
     m = wrap_func_as_nn_module(
         lambda x, y, z: torch.ops.aten.convolution.default(
             x,
@@ -197,7 +197,7 @@ def test_conv1d(self):
         torch.rand((3, 1, 800)),
         torch.rand((512, 1, 10)),
     )
-    dynamic_shapes = ([{0: Dim("dim")}, None],)
+    dynamic_shapes = (({0: Dim("dim")}, None),)
     # dynamic_shapes = None
     m = wrap_func_as_nn_module(lambda x, y: torch.ops.aten.convolution.default(
         x,
@@ -225,7 +225,7 @@ def test_conv1d(self):
 
   def test_cumsum(self):
     args = (torch.rand((10, 5)), 1)
-    dynamic_shapes = ([{0: Dim("dim")}, None],)
+    dynamic_shapes = (({0: Dim("dim")}, None),)
     m = wrap_func_as_nn_module(torch.ops.aten.cumsum.default)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -242,7 +242,7 @@ def test_cumsum(self):
 
   def test_div(self):
     args = (torch.rand((10, 12, 197)), torch.rand((10, 12, 197)))
-    dynamic_shapes = ([{0: Dim("dim")}, {0: Dim("dim")}],)
+    dynamic_shapes = (({0: Dim("dim")}, {0: Dim("dim")}),)
     m = wrap_func_as_nn_module(torch.ops.aten.div.Tensor)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -260,7 +260,7 @@ def test_div(self):
 
   def test_div_scalar(self):
     args = (torch.rand((10, 12, 197)), 8.0)
-    dynamic_shapes = ([{0: Dim("dim")}, None],)
+    dynamic_shapes = (({0: Dim("dim")}, None),)
     m = wrap_func_as_nn_module(torch.ops.aten.div.Tensor)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -277,7 +277,7 @@ def test_div_scalar(self):
 
   def test_gelu(self):
     args = (torch.rand((3, 5)),)
-    dynamic_shapes = ([{0: Dim("dim")}],)
+    dynamic_shapes = (({0: Dim("dim")},),)
     m = wrap_func_as_nn_module(torch.ops.aten.gelu)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -342,7 +342,7 @@ def forward(self, x):
 
   def test_mul(self):
     args = (torch.rand((10, 2, 768)), torch.rand((10, 2, 768)))
-    dynamic_shapes = ([{0: Dim("dim")}, {0: Dim("dim")}],)
+    dynamic_shapes = (({0: Dim("dim")}, {0: Dim("dim")}),)
     m = wrap_func_as_nn_module(torch.ops.aten.mul.Tensor)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -360,7 +360,7 @@ def test_mul(self):
 
   def test_mul_scalar(self):
     args = (torch.rand((10, 2, 768)), 0.125)
-    dynamic_shapes = ([{0: Dim("dim")}, None],)
+    dynamic_shapes = (({0: Dim("dim")}, None),)
     m = wrap_func_as_nn_module(torch.ops.aten.mul.Tensor)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -483,7 +483,7 @@ def forward(self, x, weight, bias):
 
   def test_permute(self):
     args = (torch.rand((10, 197, 12, 64)),)
-    dynamic_shapes = ([{0: Dim("dim")}],)
+    dynamic_shapes = (({0: Dim("dim")},),)
     m = wrap_func_as_nn_module(
         lambda x: torch.ops.aten.permute.default(x, [0, 2, 1, 3]))
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
@@ -502,7 +502,7 @@ def test_permute(self):
 
   def test_select(self):
     args = (torch.rand((10, 197, 768)), 1, 0)
-    dynamic_shapes = ([{0: Dim("dim")}, None, None],)
+    dynamic_shapes = (({0: Dim("dim")}, None, None),)
     m = wrap_func_as_nn_module(torch.ops.aten.select.int)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -519,7 +519,7 @@ def test_select(self):
 
   def test_slice(self):
     args = (torch.rand((10, 3, 224, 224)), 0, 0, 9223372036854775807)
-    dynamic_shapes = ([{0: Dim("dim")}, None, None, None],)
+    dynamic_shapes = (({0: Dim("dim")}, None, None, None),)
     m = wrap_func_as_nn_module(torch.ops.aten.slice.Tensor)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -537,7 +537,7 @@ def test_slice(self):
 
   def test_slice_2(self):
     args = (torch.rand((10, 3, 224, 224)), 1, 0, 2)
-    dynamic_shapes = ([{0: Dim("dim")}, None, None, None],)
+    dynamic_shapes = (({0: Dim("dim")}, None, None, None),)
     m = wrap_func_as_nn_module(torch.ops.aten.slice.Tensor)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -555,7 +555,7 @@ def test_slice_2(self):
 
   def test_softmax(self):
     args = (torch.rand((10, 12, 197, 197)), -1, False)
-    dynamic_shapes = ([{0: Dim("dim")}, None, None],)
+    dynamic_shapes = (({0: Dim("dim")}, None, None),)
     m = wrap_func_as_nn_module(torch.ops.aten._softmax.default)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -573,7 +573,7 @@ def test_softmax(self):
 
   def test_sub(self):
     args = (torch.rand((10, 1, 1, 10)), torch.rand((10, 1, 1, 10)))
-    dynamic_shapes = ([{0: Dim("dim")}, {0: Dim("dim")}],)
+    dynamic_shapes = (({0: Dim("dim")}, {0: Dim("dim")}),)
     m = wrap_func_as_nn_module(torch.ops.aten.sub.Tensor)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -591,7 +591,7 @@ def test_sub(self):
 
   def test_softmax_reduce_on_dynamic_dim(self):
     args = (torch.rand((1, 8, 128, 3)), -1, False)
-    dynamic_shapes = ([{3: Dim("dim")}, None, None],)
+    dynamic_shapes = (({3: Dim("dim")}, None, None),)
     m = wrap_func_as_nn_module(torch.ops.aten._softmax.default)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -609,7 +609,7 @@ def test_softmax_reduce_on_dynamic_dim(self):
   @unittest.skip("Converted StableHLO contains i1 dtype, not expected.")
   def test_index(self):
     args = (torch.rand((2, 10)), torch.arange(5))
-    dynamic_shapes = ([None, {0: Dim("dim")}],)
+    dynamic_shapes = ((None, {0: Dim("dim")}),)
     m = wrap_func_as_nn_module(
         lambda x, y: torch.ops.aten.index.Tensor(x, [None, y]))
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
@@ -628,7 +628,7 @@ def test_index(self):
 
   def test_sub_scalar(self):
     args = (1.0, torch.rand((10, 1, 1, 10)))
-    dynamic_shapes = ([None, {0: Dim("dim")}],)
+    dynamic_shapes = ((None, {0: Dim("dim")}),)
     m = wrap_func_as_nn_module(torch.ops.aten.sub.Tensor)
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -670,7 +670,7 @@ def forward(self, x):
 
   def test_transpose_on_dynamic_dim(self):
     args = (torch.rand((1, 8, 3, 256)),)
-    dynamic_shapes = ([{2: Dim("dim")}],)
+    dynamic_shapes = (({2: Dim("dim")},),)
     m = wrap_func_as_nn_module(
         lambda x: torch.ops.aten.transpose.int(x, -2, -1))
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
@@ -688,7 +688,7 @@ def test_transpose_on_dynamic_dim(self):
 
   def test_unsqueeze_1(self):
     args = (torch.rand((3, 10)),)
-    dynamic_shapes = ([{0: Dim("dim")}],)
+    dynamic_shapes = (({0: Dim("dim")},),)
     m = wrap_func_as_nn_module(lambda x: torch.ops.aten.unsqueeze.default(x, 1))
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
@@ -705,7 +705,7 @@ def test_unsqueeze_1(self):
 
   def test_unsqueeze_2(self):
     args = (torch.rand((1, 1, 3, 256)),)
-    dynamic_shapes = ([{2: Dim("dim")}],)
+    dynamic_shapes = (({2: Dim("dim")},),)
     m = wrap_func_as_nn_module(lambda x: torch.ops.aten.unsqueeze.default(x, 2))
     ep = export(m, args=args, dynamic_shapes=dynamic_shapes)
     shlo_module = exported_program_to_stablehlo(ep)
diff --git a/test/test_devices.py b/test/test_devices.py
index e1fc804736d..259f0046623 100644
--- a/test/test_devices.py
+++ b/test/test_devices.py
@@ -2,7 +2,10 @@
 
 from absl.testing import absltest, parameterized
 import torch
+from torch import nn
+from torch.utils.data import TensorDataset, DataLoader
 import torch_xla as xla
+import torch_xla.core.xla_model as xm
 import torch_xla.runtime as xr
 import torch_xla.debug.metrics as met
 
@@ -14,8 +17,8 @@ def setUpClass(cls):
     xr.set_device_type('CPU')
     os.environ['CPU_NUM_DEVICES'] = '4'
 
-  def tearDown(self):
-    met.clear_metrics()
+  def setUp(self):
+    met.clear_all()
 
   @parameterized.parameters((None, torch.device('xla:0')),
                             (0, torch.device('xla:0')),
@@ -40,6 +43,56 @@ def test_sync(self):
 
     self.assertEqual(met.counter_value('MarkStep'), 1)
 
+  def test_step(self):
+    with xla.step():
+      torch.ones((3, 3), device=xla.device())
+
+    self.assertEqual(met.counter_value('MarkStep'), 2)
+
+  def test_step_exception(self):
+    with self.assertRaisesRegex(RuntimeError, 'Expected error'):
+      with xla.step():
+        torch.ones((3, 3), device=xla.device())
+        raise RuntimeError('Expected error')
+
+    self.assertEqual(met.counter_value('MarkStep'), 2)
+
+  # Should roughly match example given in README
+  def test_trivial_model(self):
+
+    class TrivialModel(nn.Module):
+
+      def __init__(self):
+        super().__init__()
+        self.linear = nn.Linear(10, 10)
+
+      def forward(self, x):
+        return self.linear(x)
+
+    model = TrivialModel().to(xla.device())
+
+    batch_size = 16
+    num_samples = 100
+
+    input_data = torch.randn(num_samples, 10)
+    target_data = torch.randn(num_samples, 10)
+
+    # Create a DataLoader
+    dataset = TensorDataset(input_data, target_data)
+    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+    loss_fn = nn.MSELoss()
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
+
+    for inputs, labels in loader:
+      with xla.step():
+        inputs, labels = inputs.to(xla.device()), labels.to(xla.device())
+        optimizer.zero_grad()
+        outputs = model(inputs)
+        loss = loss_fn(outputs, labels)
+        loss.backward()
+        xm.optimizer_step(optimizer)
+
 
 if __name__ == "__main__":
   absltest.main()
diff --git a/test/test_gmm.py b/test/test_gmm.py
new file mode 100644
index 00000000000..b594a85c065
--- /dev/null
+++ b/test/test_gmm.py
@@ -0,0 +1,460 @@
+import logging
+import unittest
+
+from typing import Optional, Union, Callable
+
+import torch
+import torch_xla
+import torch_xla.core.xla_model as xm
+import torch_xla.debug.metrics as met
+from torch_xla.experimental.custom_kernel import gmm, _make_group_metadata, _histogram, tgmm, gmm_backward, GMM
+from torch_xla import runtime as xr
+from torch_xla._internal import tpu
+
+import numpy as np
+
+if xr.device_type() == 'TPU':
+  from torch_xla.experimental.custom_kernel import jax_import_guard
+  jax_import_guard()
+  import jax
+  import jax.numpy as jnp
+  from jax.experimental import pallas as pl
+
+
+class MegabloxTest(unittest.TestCase):
+
+  def _reference_gmm(self, lhs: torch.Tensor, rhs: torch.Tensor,
+                     group_sizes: torch.Tensor) -> torch.Tensor:
+    start = 0
+    out = []
+    for i, size in enumerate(group_sizes):
+      result = lhs[start:start + size, :] @ rhs[i, :, :]
+      out.append(result)
+      start += group_sizes[i]
+    return torch.cat(out)
+
+  def _reference_tgmm(self, lhs: torch.Tensor, rhs: torch.Tensor,
+                      group_sizes: torch.Tensor) -> torch.Tensor:
+    start = 0
+    out = []
+    for i, size in enumerate(group_sizes):
+      result = lhs[:, start:start + size] @ rhs[start:start + size, :]
+      out.append(result)
+      start += group_sizes[i]
+    return torch.stack(out)
+
+  def _group_sizes_strategy(self, m: int, num_groups: int) -> torch.Tensor:
+    # Randomly sample the ends of the groups in the m-dimension. Let the fuzzer
+    # sample with replacement so that it's possible to get zero-sized groups. Get
+    # 'num_groups - 1' run ends. The final group will end at 'm'.
+    ends_no_final = np.sort(
+        np.array(
+            [np.random.randint(low=0, high=m) for _ in range(num_groups - 1)],
+            dtype=np.int32,
+        ),)
+    ends = np.concatenate([ends_no_final, np.array([m], dtype=np.int32)])
+
+    # Calculate the run starts by shifting ends 1 to the right. The first run
+    # starts at zero.
+    starts = np.concatenate([np.zeros(1, dtype=np.int32), ends_no_final])
+    return torch.from_numpy(ends - starts).to(torch.int32)
+
+  def _init_test_cases(self):
+    self.tests_cases = []
+    self.tests_cases.append({
+        'dtype': torch.float32,
+        'm': 128,
+        'k': 128,
+        'n': 128,
+        'num_groups': 1
+    })
+    self.tests_cases.append({
+        'dtype': torch.float32,
+        'm': 256,
+        'k': 128,
+        'n': 128,
+        'num_groups': 1
+    })
+    self.tests_cases.append({
+        'dtype': torch.float32,
+        'm': 128,
+        'k': 256,
+        'n': 128,
+        'num_groups': 8
+    })
+    self.tests_cases.append({
+        'dtype': torch.float32,
+        'm': 512,
+        'k': 128,
+        'n': 256,
+        'num_groups': 2
+    })
+
+  @unittest.skipIf(xr.device_type() != 'TPU', "This test only works on TPU.")
+  def test_gmm(self):
+    met.clear_all()
+    jax.config.update('jax_default_matmul_precision', jax.lax.Precision.HIGHEST)
+
+    self._init_test_cases()
+    for test_case in self.tests_cases:
+      num_groups = test_case['num_groups']
+      k = test_case['k']
+      m = test_case['m']
+      n = test_case['n']
+      lhs_dtype = rhs_dtype = test_case['dtype']
+
+      lhs = torch.rand(m, k, dtype=lhs_dtype)
+      rhs = torch.rand(num_groups, k, n, dtype=rhs_dtype)
+      group_sizes = self._group_sizes_strategy(m=m, num_groups=num_groups)
+      ref_out = self._reference_gmm(lhs, rhs, group_sizes)
+
+      out = gmm(lhs.to("xla"), rhs.to("xla"), group_sizes.to("xla"))
+      self.assertTrue(torch.allclose(ref_out, out.cpu()))
+
+    # Make sure gmm doesn't fallback.
+    self.assertNotIn("aten::", met.short_metrics_report())
+    jax.config.update('jax_default_matmul_precision', jax.lax.Precision.DEFAULT)
+
+  @unittest.skipIf(xr.device_type() != 'TPU', "This test only works on TPU.")
+  def test_gmm_bf16(self):
+    met.clear_all()
+
+    self._init_test_cases()
+    for test_case in self.tests_cases:
+      num_groups = test_case['num_groups']
+      k = test_case['k']
+      m = test_case['m']
+      n = test_case['n']
+      lhs_dtype = rhs_dtype = torch.bfloat16
+
+      lhs = torch.rand(m, k, dtype=lhs_dtype)
+      rhs = torch.rand(num_groups, k, n, dtype=rhs_dtype)
+      group_sizes = self._group_sizes_strategy(m=m, num_groups=num_groups)
+      ref_out = self._reference_gmm(lhs, rhs, group_sizes)
+
+      out = gmm(lhs.to("xla"), rhs.to("xla"), group_sizes.to("xla"))
+
+      self.assertTrue(torch.allclose(ref_out, out.cpu()))
+
+    # Make sure gmm doesn't fallback.
+    self.assertNotIn("aten::", met.short_metrics_report())
+
+  @unittest.skipIf(xr.device_type() != 'TPU', "This test only works on TPU.")
+  def test_make_group_metadata(self):
+    from jax.experimental.pallas.ops.tpu.megablox.gmm import make_group_metadata as jax_make_group_metadata
+    met.clear_all()
+
+    test_grids = [
+        {
+            'group_sizes': [8, 8, 8, 8],
+            'm': 32,
+            'tm': 8
+        },
+        {
+            'group_sizes': [2, 14, 8, 8],
+            'm': 32,
+            'tm': 8
+        },
+        {
+            'group_sizes': [16, 0, 8, 8],
+            'm': 32,
+            'tm': 8
+        },
+        {
+            'group_sizes': [2, 0, 14, 16],
+            'm': 32,
+            'tm': 8
+        },
+        {
+            'group_sizes': [8, 12, 0, 12],
+            'm': 32,
+            'tm': 8
+        },
+        {
+            'group_sizes': [6, 12, 0, 14],
+            'm': 32,
+            'tm': 8
+        },
+        {
+            'group_sizes': [6, 12, 0, 14],
+            'm': 32,
+            'tm': 4
+        },
+        {
+            'group_sizes': [377, 588, 153, 1638, 3261, 5890, 996, 3481],
+            'm': 16384,
+            'tm': 128
+        },
+    ]
+
+    for test_grid in test_grids:
+      jax_meta, jax_num_tiles = jax_make_group_metadata(
+          group_sizes=jnp.array(test_grid['group_sizes']),
+          m=test_grid['m'],
+          tm=test_grid['tm'],
+          start_group=0,
+          num_nonzero_groups=len(test_grid['group_sizes']),
+      )
+
+      torch_meta = _make_group_metadata(
+          group_sizes=torch.tensor(test_grid['group_sizes']).to(
+              torch.int32).to("xla"),
+          m=test_grid['m'],
+          tm=test_grid['tm'],
+          visit_empty_groups=True,
+      )
+
+      for i in range(len(jax_meta)):
+        self.assertTrue(
+            torch.all(
+                torch.from_numpy(np.array(jax_meta[i])) == torch_meta[i].cpu()))
+      self.assertEqual(jax_num_tiles, torch_meta[-1].cpu().item())
+
+    # Make sure _make_group_metadata doesn't fallback.
+    self.assertNotIn("aten::", met.short_metrics_report())
+
+  def test_histogram(self):
+    test_grids = [
+        {
+            'input': [1, 4, 4, 1, 2, 3],
+            'min': 1,
+            'max': 4,
+        },
+        {
+            'input': [1, 4, 4, 1, 2, 3],
+            'min': 2,
+            'max': 3,
+        },
+        {
+            'input': [1, 4, 4, 1, 2, 3],
+            'min': 0,
+            'max': 5,
+        },
+        {
+            'input': [],
+            'min': 0,
+            'max': 5,
+        },
+    ]
+
+    for test_grid in test_grids:
+      torch_chart = torch.histc(
+          torch.tensor(test_grid['input'], dtype=torch.float),
+          bins=test_grid['max'] - test_grid['min'] + 1,
+          min=test_grid['min'],
+          max=test_grid['max'],
+      )
+
+      chart = _histogram(
+          torch.tensor(test_grid['input'], dtype=torch.int32).to("xla"),
+          min=test_grid['min'],
+          max=test_grid['max'],
+      )
+
+      self.assertTrue(torch.all(torch_chart == chart.cpu()))
+
+  def test_histogram_raise(self):
+    with self.assertRaisesRegex(AssertionError,
+                                "input must be of torch.int32 dtype."):
+      _histogram(
+          torch.tensor([1, 4, 4, 1, 2, 3], dtype=torch.float),
+          min=4,
+          max=5,
+      )
+
+    with self.assertRaisesRegex(AssertionError,
+                                "min must be less than or equal to max."):
+      _histogram(
+          torch.tensor([1, 4, 4, 1, 2, 3], dtype=torch.int32),
+          min=4,
+          max=3,
+      )
+
+  def test_sorting_input(self):
+    met.clear_all()
+    top2 = torch.tensor([[0, 2], [1, 3], [1, 2], [2, 3]]).to("xla")
+
+    # We want to create one big batch of tokens that has all top-k choices in it.
+    # Our tokens will thus be duplicated k-times in the batch. To do this we,
+    # first flatten the expert choices list and argsort it. This gives us an array
+    # of length B * K. We then create a tiled arange of size B * K and index
+    # into the expert choices list. This will give us the set of indices we need
+    # to gather from the xs to create this big batch.
+    top_flat = top2.flatten()
+    lhs_order = top_flat.argsort()
+    lhs_reverse_order = lhs_order.argsort()
+    lhs_indices = torch.arange(
+        top2.shape[0], device="xla").repeat_interleave(2)[lhs_order]
+    group_sizes = _histogram(top_flat.to(torch.int32), 0, 3)
+    xm.mark_step()
+
+    # Make sure it doesn't fallback.
+    self.assertNotIn("aten::", met.short_metrics_report())
+    self.assertTrue(
+        torch.all(lhs_indices == torch.tensor([0, 1, 2, 0, 3, 2, 1, 3],
+                                              device="xla")))
+    self.assertTrue(
+        torch.all(group_sizes == torch.tensor([1, 2, 3, 2], device="xla")))
+
+  @unittest.skipIf(xr.device_type() != 'TPU', "This test only works on TPU.")
+  def test_tgmm(self):
+    met.clear_all()
+    jax.config.update('jax_default_matmul_precision', jax.lax.Precision.HIGHEST)
+
+    self._init_test_cases()
+    for test_case in self.tests_cases:
+      num_groups = test_case['num_groups']
+      k = test_case['k']
+      m = test_case['m']
+      n = test_case['n']
+      lhs_dtype = rhs_dtype = test_case['dtype']
+
+      lhs = torch.rand(k, m, dtype=lhs_dtype)
+      rhs = torch.rand(m, n, dtype=rhs_dtype)
+      group_sizes = self._group_sizes_strategy(m=m, num_groups=num_groups)
+      ref_out = self._reference_tgmm(lhs, rhs, group_sizes)
+
+      out = tgmm(lhs.to("xla"), rhs.to("xla"), group_sizes.to("xla"))
+      self.assertTrue(torch.allclose(ref_out, out.cpu()))
+
+    # Make sure tgmm doesn't fallback.
+    self.assertNotIn("aten::", met.short_metrics_report())
+    jax.config.update('jax_default_matmul_precision', jax.lax.Precision.DEFAULT)
+
+  @unittest.skipIf(xr.device_type() != 'TPU', "This test only works on TPU.")
+  def test_tgmm_bf16(self):
+    met.clear_all()
+
+    self._init_test_cases()
+    for test_case in self.tests_cases:
+      num_groups = test_case['num_groups']
+      k = test_case['k']
+      m = test_case['m']
+      n = test_case['n']
+      lhs_dtype = rhs_dtype = torch.bfloat16
+
+      lhs = torch.rand(k, m, dtype=lhs_dtype)
+      rhs = torch.rand(m, n, dtype=rhs_dtype)
+      group_sizes = self._group_sizes_strategy(m=m, num_groups=num_groups)
+      ref_out = self._reference_tgmm(lhs, rhs, group_sizes)
+
+      out = tgmm(lhs.to("xla"), rhs.to("xla"), group_sizes.to("xla"))
+      self.assertTrue(torch.allclose(ref_out, out.cpu()))
+
+    # Make sure tgmm doesn't fallback.
+    self.assertNotIn("aten::", met.short_metrics_report())
+
+  @unittest.skipIf(xr.device_type() != 'TPU', "This test only works on TPU.")
+  def test_gmm_backward(self):
+    self._init_test_cases()
+    for test_case in self.tests_cases:
+      num_groups = test_case['num_groups']
+      k = test_case['k']
+      m = test_case['m']
+      n = test_case['n']
+      lhs_dtype = rhs_dtype = torch.bfloat16
+
+      lhs = torch.rand(m, k, dtype=lhs_dtype, requires_grad=True)
+      rhs = torch.rand(num_groups, k, n, dtype=rhs_dtype, requires_grad=True)
+      group_sizes = self._group_sizes_strategy(m=m, num_groups=num_groups)
+      lhs.retain_grad()
+      rhs.retain_grad()
+
+      ref_out = self._reference_gmm(lhs, rhs, group_sizes)
+      ref_out.sum().backward()
+
+      ref_out_backward = torch.ones_like(ref_out)
+      grad_lhs, grad_rhs = gmm_backward(
+          ref_out_backward.to("xla"), lhs.to("xla"), rhs.to("xla"),
+          group_sizes.to("xla"))
+
+      self.assertTrue(torch.allclose(lhs.grad, grad_lhs.cpu()))
+      self.assertTrue(torch.allclose(rhs.grad, grad_rhs.cpu()))
+
+    # Make sure gmm doesn't fallback.
+    self.assertNotIn("aten::", met.short_metrics_report())
+
+  @unittest.skipIf(xr.device_type() != 'TPU', "This test only works on TPU.")
+  def test_gmm_backward_2(self):
+    self._init_test_cases()
+    for test_case in self.tests_cases:
+      num_groups = test_case['num_groups']
+      k = test_case['k']
+      m = test_case['m']
+      n = test_case['n']
+      lhs_dtype = rhs_dtype = torch.bfloat16
+
+      torch.manual_seed(42)
+      lhs = torch.rand(m, k, dtype=lhs_dtype, requires_grad=True)
+      rhs = torch.rand(num_groups, k, n, dtype=rhs_dtype, requires_grad=True)
+      group_sizes = self._group_sizes_strategy(m=m, num_groups=num_groups)
+      lhs.retain_grad()
+      rhs.retain_grad()
+
+      ref_out = self._reference_gmm(lhs, rhs, group_sizes)
+      ref_out.sum().backward()
+
+      torch.manual_seed(42)
+      lhs_xla = torch.rand(m, k, dtype=lhs_dtype, requires_grad=True).to("xla")
+      rhs_xla = torch.rand(
+          num_groups, k, n, dtype=rhs_dtype, requires_grad=True).to("xla")
+      lhs_xla.retain_grad()
+      rhs_xla.retain_grad()
+
+      out = GMM.apply(lhs_xla, rhs_xla, group_sizes.to("xla"))
+      out.sum().backward()
+
+      self.assertTrue(torch.allclose(ref_out, out.cpu()))
+      self.assertTrue(torch.allclose(lhs.grad, lhs_xla.grad.cpu()))
+      self.assertTrue(torch.allclose(rhs.grad, rhs_xla.grad.cpu()))
+
+    # Make sure gmm doesn't fallback.
+    self.assertNotIn("aten::", met.short_metrics_report())
+
+  @unittest.skipIf(xr.device_type() != 'TPU', "This test only works on TPU.")
+  def test_gmm_backward_3(self):
+    self._init_test_cases()
+    for test_case in self.tests_cases:
+      num_groups = test_case['num_groups']
+      k = test_case['k']
+      m = test_case['m']
+      n = test_case['n']
+      lhs_dtype = rhs_dtype = torch.bfloat16
+
+      torch.manual_seed(42)
+      lhs = torch.rand(m, k, dtype=lhs_dtype, requires_grad=True)
+      rhs = torch.rand(num_groups, k, n, dtype=rhs_dtype, requires_grad=True)
+      group_sizes = self._group_sizes_strategy(m=m, num_groups=num_groups)
+      lhs.retain_grad()
+      rhs.retain_grad()
+
+      ref_out = self._reference_gmm(lhs, rhs, group_sizes)
+      ref_out.sum().backward()
+
+      torch.manual_seed(42)
+      lhs_xla = torch.rand(m, k, dtype=lhs_dtype, requires_grad=True).to("xla")
+      rhs_xla = torch.rand(
+          num_groups, k, n, dtype=rhs_dtype, requires_grad=True).to("xla")
+      lhs_xla.retain_grad()
+      rhs_xla.retain_grad()
+
+      out = GMM.apply(lhs_xla, rhs_xla, group_sizes.to("xla"))
+      grad_out = torch.ones_like(out)
+      torch.autograd.backward([out], [grad_out, lhs_xla, rhs_xla])
+
+      self.assertTrue(torch.allclose(ref_out, out.cpu()))
+      self.assertTrue(torch.allclose(lhs.grad, lhs_xla.grad.cpu()))
+      self.assertTrue(torch.allclose(rhs.grad, rhs_xla.grad.cpu()))
+
+    # Make sure gmm doesn't fallback.
+    self.assertNotIn("aten::", met.short_metrics_report())
+
+
+if __name__ == '__main__':
+  logging.getLogger().setLevel(logging.INFO)
+  torch.set_default_dtype(torch.float32)
+  torch.manual_seed(42)
+  torch_xla._XLAC._xla_set_use_full_mat_mul_precision(
+      use_full_mat_mul_precision=True)
+  test = unittest.main()
+  sys.exit(0 if test.result.wasSuccessful() else 1)
diff --git a/test/test_input_output_aliases.py b/test/test_input_output_aliases.py
index c7c04f781c3..b2c5fc50b21 100644
--- a/test/test_input_output_aliases.py
+++ b/test/test_input_output_aliases.py
@@ -38,6 +38,50 @@ def test_aliasing_with_cloned(self):
     torch.allclose(t1 - 1, t1_cloned)
     self.assertEqual(met.metric_data("InputOutputAliasCount")[1], 1.0)
 
+  def test_aliasing_across_mark_step(self):
+    xla_device = xm.xla_device()
+    met.clear_all()
+    t1 = torch.randn(4, 5).to(xla_device)
+    t1 += 1
+    xm.mark_step()
+    self.assertEqual(met.metric_data("InputOutputAliasCount")[1], 1.0)
+    t1 *= 100
+    xm.mark_step()
+    self.assertEqual(met.metric_data("InputOutputAliasCount")[1], 2.0)
+
+  def test_aliasing_with_multiple_inplace_update(self):
+    BATCH_SIZE = 1
+    SEQ_LEN = 128
+    NUM_KV_HEADS = 16
+    HEAD_SIZE = 256
+    BLOCK_SIZE = 16
+    DTYPE = torch.bfloat16
+    num_blocks = 1024
+    device = xm.xla_device()
+    key = torch.randn(
+        BATCH_SIZE * SEQ_LEN,
+        NUM_KV_HEADS,
+        HEAD_SIZE,
+        device=device,
+        dtype=DTYPE)
+    k_cache = torch.randn(
+        num_blocks * BLOCK_SIZE,
+        NUM_KV_HEADS,
+        HEAD_SIZE,
+        device=device,
+        dtype=DTYPE)
+    slot_mapping = torch.randint(
+        0, num_blocks, (BATCH_SIZE, SEQ_LEN), device=device, dtype=torch.int64)
+    # materalize k_cache to device data
+    xm.mark_step()
+    met.clear_all()
+    for _ in range(10):
+      k_cache.index_copy_(0, slot_mapping.flatten(), key)
+    xm.mark_step()
+    xm.wait_device_ops()
+    self.assertEqual(met.metric_data("InputOutputAliasCount")[1], 1.0)
+    torch.allclose(k_cache[slot_mapping[0][0]].cpu(), key[0].cpu())
+
 
 if __name__ == '__main__':
   test = unittest.main()
diff --git a/test/test_metrics.py b/test/test_metrics.py
index 87c45949b32..409876d8d9d 100644
--- a/test/test_metrics.py
+++ b/test/test_metrics.py
@@ -9,6 +9,11 @@
 import unittest
 
 
+def XLAExperimentalContains(feat):
+  experimental = os.environ.get("XLA_EXPERIMENTAL", "").split(":")
+  return feat in experimental
+
+
 class MetricsTest(unittest.TestCase):
 
   def test_clear_counters(self):
@@ -205,6 +210,55 @@ def test_pybind_increment_counter(self):
     torch_xla._XLAC._xla_increment_counter('FakeCounter', 2)
     self.assertEqual(met.counter_value('FakeCounter'), 2)
 
+  def test_get_fallback_ops(self):
+
+    def getAndAssertFallbackOpsLenEquals(count):
+      fallback_ops = met.executed_fallback_ops()
+      fallback_ops_number = len(fallback_ops)
+      self.assertEqual(
+          fallback_ops_number,
+          count,
+          msg=f"found {fallback_ops_number}: {fallback_ops}")
+      return fallback_ops
+
+    # Reset all metrics, and make sure we don't start with any fallback ops.
+    met.clear_all()
+    getAndAssertFallbackOpsLenEquals(0)
+
+    # Create N boxes in the format XYXY.
+    # This should not run any fallback ops.
+    N = 10
+    x = torch.rand(N, 1).to(xm.xla_device())
+    y = torch.rand(N, 1).to(xm.xla_device())
+    width = torch.rand(N, 1).to(xm.xla_device())
+    height = torch.rand(N, 1).to(xm.xla_device())
+    xys = torch.cat((x, x + width, y, y - height), dim=1)
+    getAndAssertFallbackOpsLenEquals(0)
+
+    # tensor.item() is a fallback operation.
+    xys[0, 0].item()
+    ops = getAndAssertFallbackOpsLenEquals(1)
+    self.assertEqual(ops[0], "aten::_local_scalar_dense")
+
+    # Reset all metrics, and make sure we also don't retrieve any
+    # fallback operations.
+    met.clear_all()
+    getAndAssertFallbackOpsLenEquals(0)
+
+    if not XLAExperimentalContains("nms"):
+      # Run torchvision operations as fallback.
+      import torchvision
+      scores = torch.rand(N).to(xm.xla_device())
+      # NMS doesn't have a PyTorch/XLA implementation without dynamic shapes.
+      torchvision.ops.nms(xys, scores, 0.5)
+      # remove_small_boxes is not implemented in C++. It calls other PyTorch
+      # operations. One of them, nonzero, is a fallback operation.
+      torchvision.ops.remove_small_boxes(
+          xys, torch.median(torch.stack((width, height))))
+      ops = getAndAssertFallbackOpsLenEquals(3)
+      self.assertEqual(
+          set(ops), {"aten::nonzero", "aten::median", "torchvision::nms"})
+
 
 if __name__ == '__main__':
   test = unittest.main()
diff --git a/test/test_operations.py b/test/test_operations.py
index 7fb9f5bc3e3..b4a1838a5a5 100644
--- a/test/test_operations.py
+++ b/test/test_operations.py
@@ -13,6 +13,7 @@
 parser.add_argument('--verbosity', type=int, default=0)
 FLAGS, leftovers = parser.parse_known_args()
 sys.argv = [sys.argv[0]] + leftovers
+from absl.testing import absltest, parameterized
 
 # Normal imports section starts here.
 import collections
@@ -28,6 +29,11 @@
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
+from torch.testing._internal.common_device_type import dtypes
+from torch.testing._internal.common_dtype import (
+    all_types_and_complex_and,
+    all_types_and,
+)
 import torch_xla
 import torch_xla.core.xla_builder as xb
 import torch_xla.core.xla_op_registry as xor
@@ -40,6 +46,7 @@
 import torch_xla.distributed.spmd as xs
 from torch_xla import runtime as xr
 import torch_xla.test.test_utils as xtu
+import torch_xla.utils.dlpack as xdlpack
 import torch_xla.utils.utils as xu
 import torch_xla.utils.serialization as xser
 import torch_xla.core.xla_model as xm
@@ -88,6 +95,12 @@ def onlyOnCUDA(fn):
   return unittest.skipIf(accelerator != "cuda", "PJRT_DEVICE=CUDA required")(fn)
 
 
+def onlyIfXLAExperimentalContains(feat):
+  experimental = os.environ.get("XLA_EXPERIMENTAL", "").split(":")
+  return unittest.skipIf(feat not in experimental,
+                         f"XLA_EXPERIMENTAL={feat} required")
+
+
 def _gen_tensor(*args, **kwargs):
   return torch.randn(*args, **kwargs)
 
@@ -1361,6 +1374,19 @@ def test_fn(t, c):
         ), dtype=torch.int64)
     self.runAtenTest([token_type_ids, cat_ids], test_fn)
 
+  def test_one_hot_no_fallback(self):
+
+    def test_fn(t):
+      met.clear_all()
+      res = F.one_hot(t, num_classes=5)
+      # make sure there is no graph break
+      assert 'aten::' not in met.short_metrics_report()
+      return res
+
+    t1 = torch.arange(0, 5) % 3
+
+    self.runAtenTest([t1], test_fn)
+
   @skipIfFunctionalizationEnabled("views do not exist")
   def test_save_view_alias_check(self):
 
@@ -1976,6 +2002,86 @@ def foo(x):
     for dtype in test_dtypes:
       test(dtype)
 
+  def test_gelu_backward_different_types(self):
+
+    def foo(grad, inp):
+      return torch.ops.aten.gelu_backward.default(grad, inp)
+
+    grad = torch.rand(10, 10, dtype=torch.bfloat16)
+    inp = torch.rand(10, 10)
+
+    Xgrad = grad.to(xm.xla_device())
+    Xinp = inp.to(xm.xla_device())
+
+    r = foo(grad, inp)
+    Xr = foo(Xgrad, Xinp)
+
+    self.assertEqual(r, Xr.cpu())
+
+  def test_stack_different_types(self):
+
+    def foo(t0, t1):
+      return torch.stack([t0, t1])
+
+    t0 = torch.rand(10, 10, dtype=torch.bfloat16)
+    t1 = torch.rand(10, 10)
+
+    Xt0 = t0.to(xm.xla_device())
+    Xt1 = t1.to(xm.xla_device())
+
+    r = foo(t0, t1)
+    Xr = foo(Xt0, Xt1)
+
+    self.assertEqual(r, Xr.cpu())
+
+  def test_index_zero_tensor_by_zero_tensor(self):
+
+    # Test if simple one-tensor indexing works.
+    # Should return a non-permuted tensor.
+    def f1(x, i):
+      return x[i]
+
+    # Test if scattered two-tensor indexing works.
+    # Should return a permuted tensor, with indexed dimensions first.
+    def f2(x, i0, i1):
+      return x[:, i0, :, i1]
+
+    cases = {
+        f1: [
+            ((0,), (0,)),
+            ((0, 10), (0, 5, 5)),
+            ((0, 3, 3), (5, 5, 0)),
+        ],
+        f2: [
+            ((10, 0, 10, 10), (5, 0, 5), (5, 1, 1)),
+            ((0, 0, 10, 0), (5, 5, 0), (5, 5, 1)),
+        ]
+    }
+
+    def make_tensor(shape):
+      return torch.rand(shape)
+
+    def make_index(shape):
+      return torch.randint(0, 100, shape, dtype=torch.long)
+
+    def test(f, xshape, ishapes):
+      x = make_tensor(xshape)
+      ilist = [make_index(s) for s in ishapes]
+
+      Xx = x.to(xm.xla_device())
+      Xilist = [i.to(xm.xla_device()) for i in ilist]
+
+      out = f(x, *ilist)
+      Xout = f(Xx, *Xilist)
+
+      self.assertEqual(out, Xout.cpu())
+
+    for xshape, ishape in cases[f1]:
+      test(f1, xshape, (ishape,))
+
+    for xshape, i0shape, i1shape in cases[f2]:
+      test(f2, xshape, (i0shape, i1shape))
+
 
 class MNISTComparator(nn.Module):
 
@@ -2416,6 +2522,195 @@ def test_aten_move_scalar_cuda_to_xla(self):
     # Has a different execution path than other tensors.
     self._test_move_tensor_cuda_to_xla(torch.tensor(42))
 
+  def test_unsafe_buffer_pointer(self):
+    xla_device = xm.xla_device()
+    xla_tensor_0 = torch.tensor(42).to(xla_device)
+    # `mark_step` ensures xtensor->CurrentDataHandle() != nullptr
+    xm.mark_step()
+    buf_ptr_0 = torch_xla._XLAC._unsafe_buffer_pointer(xla_tensor_0)
+    self.assertGreaterEqual(buf_ptr_0, 0)
+
+    # xtensor->CurrentDataHandle() == nullptr but xtensor->CurrentIrValue().node != nullptr and device_data != nullptr
+    xla_tensor_1 = torch.tensor(42, device=xm.xla_device())
+    buf_ptr_1 = torch_xla._XLAC._unsafe_buffer_pointer(xla_tensor_1)
+    self.assertGreaterEqual(buf_ptr_1, 0)
+
+    # xtensor->CurrentDataHandle() == nullptr but xtensor->CurrentIrValue().node != nullptr and device_data != nullptr
+    xla_tensor_2 = torch.ones((5, 5)).to(xla_device)
+    buf_ptr_2 = torch_xla._XLAC._unsafe_buffer_pointer(xla_tensor_2)
+    self.assertGreaterEqual(buf_ptr_2, 0)
+
+    xla_tensor_3 = torch.arange(5, device=xm.xla_device())
+    xm.mark_step()
+    # Without the `wait_device_ops()`, the pjrt buffer (pjrt_data->buffer) at https://github.com/pytorch/xla/blob/e3fc03314dab5f44e3ed9ccbba6c15fbca3285cd/torch_xla/csrc/runtime/pjrt_computation_client.cc#L467 will be nullptr.
+    xm.wait_device_ops()
+    buf_ptr_3 = torch_xla._XLAC._unsafe_buffer_pointer(xla_tensor_3)
+    self.assertGreaterEqual(buf_ptr_3, 0)
+
+
+class TestDLPack(parameterized.TestCase):
+
+  def _test_dlpack_capsule_conversion_helper(self, xla_tensor):
+    dlpt = xdlpack.to_dlpack(xla_tensor)  # dlpt1 has type PyCapsule
+    xla_tensor2 = xdlpack.from_dlpack(dlpt)
+
+    self.assertEqual(xla_tensor.device, xla_tensor2.device)
+    self.assertTrue(torch.allclose(xla_tensor.cpu(), xla_tensor2.cpu()))
+    self.assertRaisesRegex(RuntimeError,
+                           "DLTensor capsule can be consumed only once",
+                           lambda: xdlpack.from_dlpack(dlpt))
+
+    self.assertEqual(
+        torch_xla._XLAC._unsafe_buffer_pointer(xla_tensor),
+        torch_xla._XLAC._unsafe_buffer_pointer(xla_tensor2))
+
+  @onlyIfTorchSupportsCUDA
+  @onlyIfPJRTDeviceIsCUDA
+  @parameterized.parameters(*all_types_and(torch.half, torch.bfloat16))
+  def test_dlpack_roundtrip_tensor(self, dtype):
+    xla_device = xm.xla_device()
+    # xtensor->CurrentDataHandle() == nullptr but xtensor->CurrentIrValue().node != nullptr and device_data != nullptr
+    # xla_tensor_2 uses XLANativeFunctions::_to_copy
+    xla_tensor_2 = torch.arange(5, dtype=dtype).to(xla_device)
+    self._test_dlpack_capsule_conversion_helper(xla_tensor_2)
+
+    # xla_tensor_3 uses arange_out IR node.
+    xla_tensor_3 = torch.arange(5, dtype=dtype, device=xm.xla_device())
+    xm.mark_step()
+    self._test_dlpack_capsule_conversion_helper(xla_tensor_3)
+
+  @onlyIfTorchSupportsCUDA
+  @onlyIfPJRTDeviceIsCUDA
+  @parameterized.parameters(*all_types_and_complex_and(torch.half,
+                                                       torch.bfloat16,
+                                                       torch.bool, torch.uint16,
+                                                       torch.uint32,
+                                                       torch.uint64))
+  def test_dlpack_roundtrip_scalar(self, dtype):
+    xla_device = xm.xla_device()
+    xla_tensor_0 = torch.tensor(42, dtype=dtype).to(xla_device)
+    # `mark_step` ensures xtensor->CurrentDataHandle() != nullptr
+    xm.mark_step()
+    self._test_dlpack_capsule_conversion_helper(xla_tensor_0)
+
+    xla_tensor_1 = torch.tensor(42, dtype=dtype).to(xla_device)
+    # xtensor->CurrentDataHandle() == nullptr but xtensor->CurrentIrValue().node != nullptr and device_data != nullptr
+    self._test_dlpack_capsule_conversion_helper(xla_tensor_1)
+
+  @onlyIfTorchSupportsCUDA
+  @onlyIfPJRTDeviceIsCUDA
+  def test_dlpack_roundtrip_bool(self):
+    xla_tensor = torch.ones(1, dtype=torch.bool).to(xm.xla_device())
+    self._test_dlpack_capsule_conversion_helper(xla_tensor)
+
+  @onlyIfTorchSupportsCUDA
+  @onlyIfPJRTDeviceIsCUDA
+  def test_dlpack_pytorch_cuda_to_xla(self):
+    t1_cuda = torch.arange(5).cuda()
+    dlt1 = torch.utils.dlpack.to_dlpack(t1_cuda)
+    xla_t1 = xdlpack.from_dlpack(dlt1)
+    self.assertEqual(xla_t1.device.type, 'xla')
+    self.assertEqual(xla_t1.device.index, t1_cuda.device.index)
+    t1_cuda[0] = t1_cuda[0] + 20
+    self.assertTrue(torch.allclose(xla_t1.cpu(), t1_cuda.cpu()))
+
+    t2_cuda = torch.tensor(5).cuda()
+    dlt2 = torch.utils.dlpack.to_dlpack(t2_cuda)
+    xla_t2 = xdlpack.from_dlpack(dlt2)
+    self.assertEqual(xla_t2.device.type, 'xla')
+    self.assertEqual(xla_t2.device.index, t2_cuda.device.index)
+    t2_cuda.fill_(6)
+    self.assertTrue(torch.allclose(xla_t2.cpu(), t2_cuda.cpu()))
+
+    cuda1 = torch.device('cuda:1')
+    t3_cuda = torch.tensor(5, device=cuda1)
+    dlt3 = torch.utils.dlpack.to_dlpack(t3_cuda)
+    xla_t3 = xdlpack.from_dlpack(dlt3)
+    self.assertEqual(xla_t3.device.type, 'xla')
+    self.assertEqual(
+        xla_t3.device.index,
+        t3_cuda.device.index,
+        msg='both value should 1. xla_t3.device should be xla:1.')
+    t3_cuda.fill_(6)
+    self.assertTrue(torch.allclose(xla_t3.cpu(), t3_cuda.cpu()))
+
+  @onlyIfTorchSupportsCUDA
+  @onlyIfPJRTDeviceIsCUDA
+  def test_dlpack_pytorch_cuda_to_xla_protocol_conversion(self):
+    # Unlike the test_dlpack_pytorch_cuda_to_xla,
+    # torch_cuda_tensor has attribute __dlpack__ and __dlpack_device__.
+    # From cuda tensors to xla tensors, the synchronization is handdled implicitly.
+    t1_cuda = torch.arange(5).cuda()
+    xla_t1 = xdlpack.from_dlpack(t1_cuda)
+    self.assertEqual(xla_t1.device.type, 'xla')
+    self.assertEqual(xla_t1.device.index, t1_cuda.device.index)
+    t1_cuda[0] = t1_cuda[0] + 20
+    self.assertTrue(torch.allclose(xla_t1.cpu(), t1_cuda.cpu()))
+
+    t2_cuda = torch.tensor(5).cuda()
+    xla_t2 = xdlpack.from_dlpack(t2_cuda)
+    self.assertEqual(xla_t2.device.type, 'xla')
+    self.assertEqual(xla_t2.device.index, t2_cuda.device.index)
+    t2_cuda.fill_(6)
+    self.assertTrue(torch.allclose(xla_t2.cpu(), t2_cuda.cpu()))
+
+    cuda1 = torch.device('cuda:1')
+    t3_cuda = torch.tensor(5, device=cuda1)
+    xla_t3 = xdlpack.from_dlpack(t3_cuda)
+    self.assertEqual(xla_t3.device.type, 'xla')
+    self.assertEqual(
+        xla_t3.device.index,
+        t3_cuda.device.index,
+        msg='both value should 1. xla_t3.device should be xla:1.')
+    t3_cuda.fill_(6)
+    self.assertTrue(torch.allclose(xla_t3.cpu(), t3_cuda.cpu()))
+
+  @onlyIfTorchSupportsCUDA
+  @onlyIfPJRTDeviceIsCUDA
+  def test_dlpack_xla_to_pytorch_cuda(self):
+    xla_t1 = torch.arange(5).to(xm.xla_device())
+    dlt1 = xdlpack.to_dlpack(xla_t1)
+    cuda_t1 = torch.utils.dlpack.from_dlpack(dlt1)
+    self.assertEqual(cuda_t1.device.type, 'cuda')
+    self.assertEqual(cuda_t1.device.index, xla_t1.device.index)
+    cuda_t1[0] = cuda_t1[0] + 20
+    self.assertTrue(torch.allclose(xla_t1.cpu(), cuda_t1.cpu()))
+
+  @onlyIfTorchSupportsCUDA
+  @onlyIfPJRTDeviceIsCUDA
+  def test_dlpack_non_default_layout(self):
+    cuda_t = torch.arange(25, device=torch.device('cuda')).reshape(5, 5)
+
+    t1 = cuda_t.t()
+    xla_t1 = xdlpack.from_dlpack(t1.__dlpack__())
+    self.assertEqual(xla_t1.device.type, 'xla')
+    self.assertEqual(xla_t1.device.index, t1.device.index)
+    self.assertTrue(torch.allclose(t1.cpu(), xla_t1.cpu()))
+
+    t2 = cuda_t[0]
+    xla_t2 = xdlpack.from_dlpack(t2.__dlpack__())
+    self.assertEqual(xla_t2.device.type, 'xla')
+    self.assertEqual(xla_t2.device.index, t2.device.index)
+    self.assertTrue(torch.allclose(t2.cpu(), xla_t2.cpu()))
+
+    t3 = cuda_t[:, 0]
+    self.assertRaisesRegex(
+        RuntimeError,
+        r"Only DLPack tensors with trivial \(compact\) striding are supported",
+        lambda: xdlpack.from_dlpack(t3.__dlpack__()))
+
+    t4 = cuda_t[1, :]
+    xla_t4 = xdlpack.from_dlpack(t4.__dlpack__())
+    self.assertEqual(xla_t4.device.type, 'xla')
+    self.assertEqual(xla_t4.device.index, t4.device.index)
+    self.assertTrue(torch.allclose(t4.cpu(), xla_t4.cpu()))
+
+    t5 = cuda_t[1]
+    xla_t5 = xdlpack.from_dlpack(t5.__dlpack__())
+    self.assertEqual(xla_t5.device.type, 'xla')
+    self.assertEqual(xla_t5.device.index, t5.device.index)
+    self.assertTrue(torch.allclose(t5.cpu(), xla_t5.cpu()))
+
 
 class SimpleModelWithDropout(torch.nn.Module):
 
@@ -2454,6 +2749,7 @@ def test_dropout(self):
 
 # These tests were extracted and adapted from torchvision.
 # Source: vision/test/test_ops.py
+@onlyIfXLAExperimentalContains("nms")
 class TestNMS(test_utils.XlaTestCase):
 
   def _reference_nms(self, boxes, scores, iou_threshold):
@@ -2530,6 +2826,54 @@ def fn(boxes, scores):
     self.runAtenTest((boxes, scores), fn)
 
 
+class TestHelperFunction(test_utils.XlaTestCase):
+
+  def test_repeat_truncated(self):
+    from torch_xla.experimental.custom_kernel import repeat_with_fixed_output_size
+    met.clear_all()
+    device = torch_xla.device()
+    total_repeat_length = 20
+    input = torch.randn(10).to(device)
+    repeats = torch.tensor([0, 1, 2, 0, 4, 0, 6, 7, 8, 9]).to(device)
+    res = repeat_with_fixed_output_size(input, repeats, total_repeat_length)
+    # make sure there is no graph break
+    assert 'aten::' not in met.short_metrics_report()
+    expected = torch.repeat_interleave(input, repeats)[:total_repeat_length]
+    self.assertTrue(torch.allclose(res.cpu(), expected.cpu()))
+
+  def test_repeat_extended(self):
+    from torch_xla.experimental.custom_kernel import repeat_with_fixed_output_size
+    met.clear_all()
+    device = torch_xla.device()
+    total_repeat_length = 100
+    input = torch.randn(10).to(device)
+    repeats = torch.tensor([0, 5, 2, 0, 4, 9, 6, 7, 8, 0]).to(device)
+    res = repeat_with_fixed_output_size(input, repeats, total_repeat_length)
+    # make sure there is no graph break
+    assert 'aten::' not in met.short_metrics_report()
+    base = torch.repeat_interleave(input, repeats)[:total_repeat_length]
+    # remaining space will be filled with last value in `input`.
+    expected = torch.cat(
+        (base,
+         torch.repeat_interleave(input[-1],
+                                 total_repeat_length - base.size()[0])))
+    self.assertTrue(torch.allclose(res.cpu(), expected.cpu()))
+
+  def test_repeat_special(self):
+    from torch_xla.experimental.custom_kernel import repeat_with_fixed_output_size
+    met.clear_all()
+    device = torch_xla.device()
+    total_repeat_length = 135
+    num_groups = 8
+    input = torch.arange(num_groups, dtype=torch.int32).to(device)
+    repeats = torch.tensor([3, 6, 2, 14, 27, 47, 8, 28]).to(device)
+    res = repeat_with_fixed_output_size(input, repeats, total_repeat_length)
+    # make sure there is no graph break
+    assert 'aten::' not in met.short_metrics_report()
+    expected = torch.repeat_interleave(input, repeats)[:total_repeat_length]
+    self.assertTrue(torch.allclose(res.cpu(), expected.cpu()))
+
+
 if __name__ == '__main__':
   torch.set_default_dtype(torch.float32)
   torch.manual_seed(42)
diff --git a/test/test_ops.py b/test/test_ops.py
index 12b874593bd..3b098e85f93 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -225,6 +225,7 @@ def __new__(cls, name, variant_test_name=""):
             AllowedOpInfoEntry('norm', 'fro'),
             AllowedOpInfoEntry('special.erfcx'),
             AllowedOpInfoEntry('_native_batch_norm_legit'),
+            AllowedOpInfoEntry('full'),
 
             # Duplicate Redundant entries for this test.
             # AllowedOpInfoEntry('polygamma', 'polygamma_n_1'),
@@ -393,7 +394,7 @@ def _cpu(t):
           return tuple(map(to_cpu, x))
         elif isinstance(x, dict):
           return {k: to_cpu(v) for k, v in x.items()}
-        elif isinstance(x, (numbers.Number, bool, str)):
+        elif isinstance(x, (numbers.Number, bool, str, torch.dtype)):
           return x
 
         # Passthrough None because some functions wrapped with type promotion
@@ -426,5 +427,4 @@ def test_reference_eager(self, device, dtype, op):
 instantiate_device_type_tests(TestOpInfo, globals())
 
 if __name__ == '__main__':
-  #run_tests()
   unittest.main()
diff --git a/test/test_pallas.py b/test/test_pallas.py
index 2902b5e21ba..25c487912cf 100644
--- a/test/test_pallas.py
+++ b/test/test_pallas.py
@@ -10,6 +10,8 @@
 from torch_xla import runtime as xr
 from torch_xla._internal import tpu
 
+import numpy as np
+
 if xr.device_type() == 'TPU':
   from torch_xla.experimental.custom_kernel import jax_import_guard
   jax_import_guard()
@@ -20,12 +22,52 @@
 
 class PallasTest(unittest.TestCase):
 
-  def _attention(self, q, k, v):
+  # This is to create a diagonal mask where only elements within the same segment
+  # can attend to each other. Since the mask is to mask out the unrelevant parts,
+  # therefore we use != instead of ==.
+  def _make_attention_mask_from_segment_ids(self, q_segment_ids,
+                                            kv_segment_ids):
+    return q_segment_ids.view(q_segment_ids.shape[0], 1,
+                              q_segment_ids.shape[1], 1) != kv_segment_ids.view(
+                                  kv_segment_ids.shape[0], 1, 1,
+                                  kv_segment_ids.shape[1])
+
+  def _attention(self, q, k, v, *, attn_mask=None):
     attn_weight = q @ k.transpose(-2, -1)
+    if attn_mask is not None:
+      # Masked out the unrelevant parts.
+      attn_weight = attn_weight.masked_fill(attn_mask,
+                                            torch.finfo(attn_weight.dtype).min)
     attn_weight = nn.functional.softmax(attn_weight, dim=-1)
     attn_output = attn_weight @ v
     return attn_output
 
+  # The following helper functions prefixed with _pagedattention are used for PagedAttention unit tests
+  # Reference: https://github.com/google/jax/blob/main/tests/pallas/paged_attention_kernel_test.py
+  def _pagedattention_generate_qkv(
+      self,
+      seq_lens,
+      page_size,
+      max_seq_len,
+      num_kv_heads,
+      num_heads,
+      head_dim,
+      dtype=torch.float32,
+  ):
+    assert max_seq_len % page_size == 0
+    pages_per_sequence = max_seq_len // page_size
+    batch_size = len(seq_lens)
+    total_pages = batch_size * pages_per_sequence
+    k_pages = torch.randn(
+        num_kv_heads, total_pages, page_size, head_dim, dtype=dtype)
+    v_pages = torch.randn(
+        num_kv_heads, total_pages, page_size, head_dim, dtype=dtype)
+    page_indices = torch.randperm(
+        batch_size * pages_per_sequence, dtype=torch.int32)
+    page_indices = page_indices.reshape(batch_size, pages_per_sequence)
+    q = torch.randn(batch_size, num_heads, head_dim, dtype=dtype)
+    return q, k_pages, v_pages, page_indices
+
   @unittest.skipIf(xr.device_type() != 'TPU', "This test only works on TPU.")
   def test_tpu_custom_call_pallas_add(self):
     # This payload is generated by the following Pallas code:
@@ -417,6 +459,7 @@ def test__flash_attention_bwd_dkv(self):
   @unittest.skipIf(xr.device_type() != 'TPU' or tpu.version() < 3,
                    "This test only works on TPUv3+.")
   def test_flash_attention_backward(self):
+    jax.config.update('jax_default_matmul_precision', jax.lax.Precision.HIGHEST)
     from torch_xla.experimental.custom_kernel import flash_attention
 
     torch.manual_seed(42)
@@ -449,9 +492,375 @@ def test_flash_attention_backward(self):
     loss.backward()
     xm.mark_step()
 
-    mse = torch.nn.MSELoss()
     for i in [(q, q_grad), (k, k_grad), (v, v_grad)]:
-      self.assertTrue(mse(i[0].grad.cpu(), i[1].cpu()) < 1e-4)
+      self.assertTrue(torch.allclose(i[0].grad.cpu(), i[1].cpu(), atol=1e-05))
+    jax.config.update('jax_default_matmul_precision', jax.lax.Precision.DEFAULT)
+
+  @unittest.skipIf(xr.device_type() != 'TPU' or tpu.version() < 4,
+                   "This test only works on TPUv4+.")
+  def test_paged_attention_wrapper(self):
+    from torch_xla.experimental.custom_kernel import paged_attention
+    from jax.experimental.pallas.ops.tpu.paged_attention.paged_attention_kernel import paged_attention as jax_paged_attention
+
+    max_kv_len = 2048
+    block_size = 512
+    page_size = 64
+    num_kv_heads = 8
+    q_kv_head_ratio = 8
+    head_dim = 256
+    dtype = torch.float32
+    seq_lens = torch.tensor([0, 3, 256, 513, 1023, 2048], dtype=torch.int32)
+
+    q, k_pages, v_pages, page_indices = self._pagedattention_generate_qkv(
+        seq_lens,
+        page_size,
+        max_kv_len,
+        num_kv_heads,
+        num_kv_heads * q_kv_head_ratio,
+        head_dim,
+    )
+
+    q_xla = q.to("xla")
+    k_pages_xla = k_pages.to("xla")
+    v_pages_xla = v_pages.to("xla")
+    seq_lens_xla = seq_lens.to("xla")
+    page_indices_xla = page_indices.to("xla")
+
+    output = paged_attention(
+        q_xla,
+        k_pages_xla,
+        v_pages_xla,
+        seq_lens_xla,
+        page_indices_xla,
+        pages_per_compute_block=block_size // page_size,
+    )
+
+    q_jax = jnp.array(q.numpy(), dtype=jnp.float32)
+    k_pages_jax = jnp.array(k_pages.numpy(), dtype=jnp.float32)
+    v_pages_jax = jnp.array(v_pages.numpy(), dtype=jnp.float32)
+    seq_lens_jax = jnp.array(seq_lens.numpy(), dtype=jnp.int32)
+    page_indices_jax = jnp.array(page_indices.numpy(), dtype=jnp.int32)
+    expected_output = torch.from_numpy(
+        np.array(
+            jax_paged_attention(
+                q_jax,
+                k_pages_jax,
+                v_pages_jax,
+                seq_lens_jax,
+                page_indices_jax,
+                pages_per_compute_block=block_size // page_size,
+            )))
+
+    self.assertTrue(
+        torch.allclose(
+            output.cpu()[seq_lens > 0],
+            expected_output.cpu()[seq_lens > 0],
+            atol=1e-5,
+            rtol=1e-5))
+
+  @unittest.skipIf(xr.device_type() != 'TPU' or tpu.version() < 4,
+                   "This test only works on TPUv4+.")
+  def test_paged_attention_wrapper_with_megacore_modes(self):
+    from torch_xla.experimental.custom_kernel import paged_attention
+    from jax.experimental.pallas.ops.tpu.paged_attention.paged_attention_kernel import paged_attention as jax_paged_attention
+
+    max_kv_len = 2048
+    block_size = 512
+    page_size = 64
+    num_kv_heads = 8
+    q_kv_head_ratio = 8
+    head_dim = 256
+    dtype = torch.float32
+    seq_lens = torch.tensor([0, 3, 256, 513, 1023, 2048], dtype=torch.int32)
+
+    q, k_pages, v_pages, page_indices = self._pagedattention_generate_qkv(
+        seq_lens,
+        page_size,
+        max_kv_len,
+        num_kv_heads,
+        num_kv_heads * q_kv_head_ratio,
+        head_dim,
+    )
+
+    q_xla = q.to("xla")
+    k_pages_xla = k_pages.to("xla")
+    v_pages_xla = v_pages.to("xla")
+    seq_lens_xla = seq_lens.to("xla")
+    page_indices_xla = page_indices.to("xla")
+
+    outputs = []
+    for megacore_mode in ['kv_head', 'batch', None]:
+      outputs.append(
+          paged_attention(
+              q_xla,
+              k_pages_xla,
+              v_pages_xla,
+              seq_lens_xla,
+              page_indices_xla,
+              pages_per_compute_block=block_size // page_size,
+              megacore_mode=megacore_mode))
+
+    q_jax = jnp.array(q.numpy(), dtype=jnp.float32)
+    k_pages_jax = jnp.array(k_pages.numpy(), dtype=jnp.float32)
+    v_pages_jax = jnp.array(v_pages.numpy(), dtype=jnp.float32)
+    seq_lens_jax = jnp.array(seq_lens.numpy(), dtype=jnp.int32)
+    page_indices_jax = jnp.array(page_indices.numpy(), dtype=jnp.int32)
+    expected_outputs = []
+    for megacore_mode in ['kv_head', 'batch', None]:
+      expected_outputs.append(
+          torch.from_numpy(
+              np.array(
+                  jax_paged_attention(
+                      q_jax,
+                      k_pages_jax,
+                      v_pages_jax,
+                      seq_lens_jax,
+                      page_indices_jax,
+                      pages_per_compute_block=block_size // page_size,
+                      megacore_mode=megacore_mode))))
+
+    for output, expected_output in zip(outputs, expected_outputs):
+      self.assertTrue(
+          torch.allclose(
+              output.cpu()[seq_lens > 0],
+              expected_output.cpu()[seq_lens > 0],
+              atol=1e-5,
+              rtol=1e-5))
+
+  @unittest.skipIf(xr.device_type() != 'TPU' or tpu.version() < 4,
+                   "This test only works on TPUv4+.")
+  def test_paged_attention_wrapper_with_dynamo(self):
+    from torch_xla.experimental.custom_kernel import paged_attention
+    from jax.experimental.pallas.ops.tpu.paged_attention.paged_attention_kernel import paged_attention as jax_paged_attention
+
+    max_kv_len = 2048
+    block_size = 512
+    page_size = 64
+    num_kv_heads = 8
+    q_kv_head_ratio = 8
+    head_dim = 256
+    dtype = torch.float32
+    seq_lens = torch.tensor([0, 3, 256, 513, 1023, 2048], dtype=torch.int32)
+
+    q, k_pages, v_pages, page_indices = self._pagedattention_generate_qkv(
+        seq_lens,
+        page_size,
+        max_kv_len,
+        num_kv_heads,
+        num_kv_heads * q_kv_head_ratio,
+        head_dim,
+    )
+
+    q_xla = q.to("xla")
+    k_pages_xla = k_pages.to("xla")
+    v_pages_xla = v_pages.to("xla")
+    seq_lens_xla = seq_lens.to("xla")
+    page_indices_xla = page_indices.to("xla")
+
+    def paged_attention_wrapper(q, k, v, seq_lens, page_indices,
+                                pages_per_compute_block):
+      return torch.ops.xla.paged_attention(
+          q,
+          k,
+          v,
+          seq_lens,
+          page_indices,
+          pages_per_compute_block=pages_per_compute_block,
+      )
+
+    compiled_paged_attention = torch.compile(
+        paged_attention_wrapper, backend="openxla")
+
+    output = compiled_paged_attention(
+        q_xla,
+        k_pages_xla,
+        v_pages_xla,
+        seq_lens_xla,
+        page_indices_xla,
+        pages_per_compute_block=block_size // page_size,
+    )
+
+    q_jax = jnp.array(q.numpy(), dtype=jnp.float32)
+    k_pages_jax = jnp.array(k_pages.numpy(), dtype=jnp.float32)
+    v_pages_jax = jnp.array(v_pages.numpy(), dtype=jnp.float32)
+    seq_lens_jax = jnp.array(seq_lens.numpy(), dtype=jnp.int32)
+    page_indices_jax = jnp.array(page_indices.numpy(), dtype=jnp.int32)
+    expected_output = torch.from_numpy(
+        np.array(
+            jax_paged_attention(
+                q_jax,
+                k_pages_jax,
+                v_pages_jax,
+                seq_lens_jax,
+                page_indices_jax,
+                pages_per_compute_block=block_size // page_size,
+            )))
+
+    self.assertTrue(
+        torch.allclose(
+            output.cpu()[seq_lens > 0],
+            expected_output.cpu()[seq_lens > 0],
+            atol=1e-5,
+            rtol=1e-5))
+
+  @unittest.skipIf(xr.device_type() != 'TPU' or tpu.version() < 3,
+                   "This test only works on TPUv3+.")
+  def test_flash_attention_wrapper_segment_ids_1(self):
+    from torch_xla.experimental.custom_kernel import flash_attention
+    from jax.experimental.pallas.ops.tpu.flash_attention import flash_attention as jax_flash_attention, SegmentIds
+
+    q = torch.randn(3, 2, 128, 4)
+    k = torch.randn(3, 2, 128, 4)
+    v = torch.randn(3, 2, 128, 4)
+    zeros = torch.zeros(3, 32)
+    segment_ids = torch.cat([zeros, zeros + 1, zeros + 2, zeros + 3], dim=1)
+    o = flash_attention(
+        q.to("xla"), k.to("xla"), v.to("xla"), False, segment_ids.to("xla"),
+        segment_ids.to("xla"))
+
+    jax_q = jnp.array(q.numpy(), dtype=jnp.float32)
+    jax_k = jnp.array(k.numpy(), dtype=jnp.float32)
+    jax_v = jnp.array(v.numpy(), dtype=jnp.float32)
+    jax_segment_ids = jnp.array(segment_ids.numpy(), dtype=jnp.float32)
+    expected_o = torch.from_numpy(
+        np.array(
+            jax_flash_attention(
+                jax_q,
+                jax_k,
+                jax_v,
+                segment_ids=SegmentIds(jax_segment_ids, jax_segment_ids),
+            )))
+
+    self.assertTrue(torch.allclose(o.cpu(), expected_o.cpu(), atol=1e-05))
+
+  @unittest.skipIf(xr.device_type() != 'TPU' or tpu.version() < 3,
+                   "This test only works on TPUv3+.")
+  def test_flash_attention_wrapper_segment_ids_2(self):
+    jax.config.update('jax_default_matmul_precision', jax.lax.Precision.HIGHEST)
+    from torch_xla.experimental.custom_kernel import flash_attention
+
+    q = torch.randn(3, 2, 128, 4).to("xla")
+    k = torch.randn(3, 2, 128, 4).to("xla")
+    v = torch.randn(3, 2, 128, 4).to("xla")
+    zeros = torch.zeros(3, 32).to("xla")
+    segment_ids = torch.cat([zeros, zeros + 1, zeros + 2, zeros + 3], dim=1)
+    o = flash_attention(q, k, v, False, segment_ids, segment_ids)
+
+    expected_o = self._attention(
+        q,
+        k,
+        v,
+        attn_mask=self._make_attention_mask_from_segment_ids(
+            segment_ids, segment_ids))
+    self.assertTrue(torch.allclose(o.cpu(), expected_o.cpu(), atol=1e-05))
+    jax.config.update('jax_default_matmul_precision', jax.lax.Precision.DEFAULT)
+
+  @unittest.skipIf(xr.device_type() != 'TPU' or tpu.version() < 3,
+                   "This test only works on TPUv3+.")
+  def test_flash_attention_backward_segment_ids(self):
+    jax.config.update('jax_default_matmul_precision', jax.lax.Precision.HIGHEST)
+    from torch_xla.experimental.custom_kernel import flash_attention
+
+    torch.manual_seed(42)
+    q = torch.randn(4, 2, 128, 8, requires_grad=True).to("xla")
+    k = torch.randn(4, 2, 128, 8, requires_grad=True).to("xla")
+    v = torch.randn(4, 2, 128, 8, requires_grad=True).to("xla")
+    zeros = torch.zeros(4, 32).to("xla")
+    segment_ids = torch.cat([zeros, zeros + 1, zeros + 2, zeros + 3], dim=1)
+    q.retain_grad()
+    k.retain_grad()
+    v.retain_grad()
+
+    o = flash_attention(q, k, v, False, segment_ids, segment_ids)
+    loss = o.sum()
+    loss.backward()
+    xm.mark_step()
+
+    q_grad = q.grad
+    k_grad = k.grad
+    v_grad = v.grad
+
+    torch.manual_seed(42)
+    q = torch.randn(4, 2, 128, 8, requires_grad=True).to("xla")
+    k = torch.randn(4, 2, 128, 8, requires_grad=True).to("xla")
+    v = torch.randn(4, 2, 128, 8, requires_grad=True).to("xla")
+    zeros = torch.zeros(4, 32).to("xla")
+    segment_ids = torch.cat([zeros, zeros + 1, zeros + 2, zeros + 3], dim=1)
+    q.retain_grad()
+    k.retain_grad()
+    v.retain_grad()
+
+    o = self._attention(
+        q,
+        k,
+        v,
+        attn_mask=self._make_attention_mask_from_segment_ids(
+            segment_ids, segment_ids))
+    loss = o.sum()
+    loss.backward()
+    xm.mark_step()
+
+    for i in [(q, q_grad), (k, k_grad), (v, v_grad)]:
+      self.assertTrue(torch.allclose(i[0].grad.cpu(), i[1].cpu(), atol=1e-05))
+    jax.config.update('jax_default_matmul_precision', jax.lax.Precision.DEFAULT)
+
+  @unittest.skipIf(xr.device_type() != 'TPU' or tpu.version() < 3,
+                   "This test only works on TPUv3+.")
+  def test_flash_attention_wrapper_sm_scale(self):
+    jax.config.update('jax_default_matmul_precision', jax.lax.Precision.HIGHEST)
+    from torch_xla.experimental.custom_kernel import flash_attention
+
+    q = torch.randn(3, 2, 128, 4).to("xla")
+    k = torch.randn(3, 2, 128, 4).to("xla")
+    v = torch.randn(3, 2, 128, 4).to("xla")
+    sm_scale = 0.7
+    o = flash_attention(q, k, v, False, None, None, sm_scale)
+
+    expected_o = self._attention(q * sm_scale, k, v)
+    self.assertTrue(torch.allclose(o.cpu(), expected_o.cpu(), atol=1e-05))
+    jax.config.update('jax_default_matmul_precision', jax.lax.Precision.DEFAULT)
+
+  @unittest.skipIf(xr.device_type() != 'TPU' or tpu.version() < 3,
+                   "This test only works on TPUv3+.")
+  def test_flash_attention_sm_scale_backward(self):
+    jax.config.update('jax_default_matmul_precision', jax.lax.Precision.HIGHEST)
+    from torch_xla.experimental.custom_kernel import flash_attention
+
+    torch.manual_seed(42)
+    q = torch.randn(4, 2, 128, 8, requires_grad=True).to("xla")
+    k = torch.randn(4, 2, 128, 8, requires_grad=True).to("xla")
+    v = torch.randn(4, 2, 128, 8, requires_grad=True).to("xla")
+    sm_scale = 0.7
+    q.retain_grad()
+    k.retain_grad()
+    v.retain_grad()
+
+    o = flash_attention(q, k, v, False, None, None, sm_scale)
+    loss = o.sum()
+    loss.backward()
+    xm.mark_step()
+
+    q_grad = q.grad
+    k_grad = k.grad
+    v_grad = v.grad
+
+    torch.manual_seed(42)
+    q = torch.randn(4, 2, 128, 8, requires_grad=True).to("xla")
+    k = torch.randn(4, 2, 128, 8, requires_grad=True).to("xla")
+    v = torch.randn(4, 2, 128, 8, requires_grad=True).to("xla")
+    q.retain_grad()
+    k.retain_grad()
+    v.retain_grad()
+
+    o = self._attention(q * sm_scale, k, v)
+    loss = o.sum()
+    loss.backward()
+    xm.mark_step()
+
+    # Hmm, the gradients are the same even the autograd graph seems different.
+    for i in [(q, q_grad), (k, k_grad), (v, v_grad)]:
+      self.assertTrue(torch.allclose(i[0].grad.cpu(), i[1].cpu(), atol=1e-05))
+    jax.config.update('jax_default_matmul_precision', jax.lax.Precision.DEFAULT)
 
 
 if __name__ == '__main__':
diff --git a/test/test_pallas_spmd.py b/test/test_pallas_spmd.py
new file mode 100644
index 00000000000..33434594191
--- /dev/null
+++ b/test/test_pallas_spmd.py
@@ -0,0 +1,110 @@
+import logging
+import os
+import unittest
+
+import torch
+from torch import nn as nn
+
+import torch_xla
+import torch_xla.core.xla_model as xm
+import torch_xla.distributed.spmd as xs
+from torch_xla import runtime as xr
+from torch_xla._internal import tpu
+
+if xr.device_type() == 'TPU':
+  from torch_xla.experimental.custom_kernel import flash_attention
+  from torch_xla.experimental.custom_kernel import jax_import_guard
+  jax_import_guard()
+  import jax
+  import jax.numpy as jnp
+  from jax.experimental import pallas as pl
+
+
+class PallasTest(unittest.TestCase):
+
+  def _attention(self, q, k, v):
+    attn_weight = q @ k.transpose(-2, -1)
+    attn_weight = nn.functional.softmax(attn_weight, dim=-1)
+    attn_output = attn_weight @ v
+    return attn_output
+
+  @unittest.skipIf(xr.device_type() != 'TPU' or tpu.version() < 3,
+                   "This test only works on TPUv3+.")
+  def test_flash_attention_spmd_data_parallel(self):
+    jax.config.update('jax_default_matmul_precision', jax.lax.Precision.HIGHEST)
+    n_devices = xr.global_runtime_device_count()
+    xs.set_global_mesh(xs.Mesh(range(n_devices), (n_devices, 1, 1, 1)))
+
+    q = torch.randn(4, 2, 128, 4).to("xla")
+    k = torch.randn(4, 2, 128, 4).to("xla")
+    v = torch.randn(4, 2, 128, 4).to("xla")
+
+    o = flash_attention(q, k, v, partition_spec=range(n_devices))
+    self.assertEqual(
+        torch_xla._XLAC._get_xla_sharding_spec(o),
+        f"{{devices=[{n_devices},1,1,1]0,1,2,3}}")
+
+    expected_o = self._attention(q, k, v)
+    self.assertTrue(torch.allclose(o.cpu(), expected_o.cpu(), atol=1e-05))
+    jax.config.update('jax_default_matmul_precision', jax.lax.Precision.DEFAULT)
+
+  @unittest.skipIf(xr.device_type() != 'TPU' or tpu.version() < 3,
+                   "This test only works on TPUv3+.")
+  def test_flash_attention_backward_spmd_data_parallel(self):
+    jax.config.update('jax_default_matmul_precision', jax.lax.Precision.HIGHEST)
+    n_devices = xr.global_runtime_device_count()
+    xs.set_global_mesh(xs.Mesh(range(n_devices), (n_devices, 1, 1, 1)))
+
+    torch.manual_seed(42)
+    q = torch.randn(4, 2, 128, 8, requires_grad=True).to("xla")
+    k = torch.randn(4, 2, 128, 8, requires_grad=True).to("xla")
+    v = torch.randn(4, 2, 128, 8, requires_grad=True).to("xla")
+    q.retain_grad()
+    k.retain_grad()
+    v.retain_grad()
+
+    o = flash_attention(q, k, v, partition_spec=range(n_devices))
+    loss = o.sum()
+    loss.backward()
+    xm.mark_step()
+
+    q_grad = q.grad
+    k_grad = k.grad
+    v_grad = v.grad
+    self.assertEqual(
+        torch_xla._XLAC._get_xla_sharding_spec(q_grad),
+        f"{{devices=[{n_devices},1,1,1]0,1,2,3}}")
+    self.assertEqual(
+        torch_xla._XLAC._get_xla_sharding_spec(k_grad),
+        f"{{devices=[{n_devices},1,1,1]0,1,2,3}}")
+    self.assertEqual(
+        torch_xla._XLAC._get_xla_sharding_spec(v_grad),
+        f"{{devices=[{n_devices},1,1,1]0,1,2,3}}")
+
+    torch.manual_seed(42)
+    q = torch.randn(4, 2, 128, 8, requires_grad=True).to("xla")
+    k = torch.randn(4, 2, 128, 8, requires_grad=True).to("xla")
+    v = torch.randn(4, 2, 128, 8, requires_grad=True).to("xla")
+    q.retain_grad()
+    k.retain_grad()
+    v.retain_grad()
+
+    o = self._attention(q, k, v)
+    loss = o.sum()
+    loss.backward()
+    xm.mark_step()
+
+    for i in [(q, q_grad), (k, k_grad), (v, v_grad)]:
+      self.assertTrue(torch.allclose(i[0].grad.cpu(), i[1].cpu(), atol=1e-05))
+    jax.config.update('jax_default_matmul_precision', jax.lax.Precision.DEFAULT)
+
+
+if __name__ == '__main__':
+  logging.getLogger().setLevel(logging.INFO)
+  torch.set_default_dtype(torch.float32)
+  torch.manual_seed(42)
+  torch_xla._XLAC._xla_set_use_full_mat_mul_precision(
+      use_full_mat_mul_precision=True)
+  xr.use_spmd()
+  test = unittest.main()
+  sys.exit(0 if test.result.wasSuccessful() else 1)
diff --git a/test/tpu/Dockerfile b/test/tpu/Dockerfile
index 6a1a9520b58..8acbdf818f4 100644
--- a/test/tpu/Dockerfile
+++ b/test/tpu/Dockerfile
@@ -3,12 +3,12 @@ FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:tpu as b
 # Replace value with the latest runner release version
 # source: https://github.com/actions/runner/releases
 # ex: 2.303.0
-ARG RUNNER_VERSION="2.314.1"
+ARG RUNNER_VERSION="2.316.1"
 ARG RUNNER_ARCH="x64"
 # Replace value with the latest runner-container-hooks release version
 # source: https://github.com/actions/runner-container-hooks/releases
 # ex: 0.3.1
-ARG RUNNER_CONTAINER_HOOKS_VERSION="0.5.1"
+ARG RUNNER_CONTAINER_HOOKS_VERSION="0.6.0"
 
 ARG USER=runner
 
diff --git a/test/tpu/run_tests.sh b/test/tpu/run_tests.sh
index 413951854d6..fc4024a462c 100755
--- a/test/tpu/run_tests.sh
+++ b/test/tpu/run_tests.sh
@@ -11,15 +11,20 @@ python3 test/spmd/test_xla_distributed_checkpoint.py
 python3 test/spmd/test_train_spmd_linear_model.py
 python3 test/spmd/test_xla_spmd_python_api_interaction.py
 python3 test/spmd/test_xla_auto_sharding.py
-XLA_EXPERIMENTAL=nonzero:masked_select python3 test/ds/test_dynamic_shape_models.py -v
-XLA_EXPERIMENTAL=nonzero:masked_select python3 test/ds/test_dynamic_shapes.py -v
+python3 test/spmd/test_fsdp_v2.py
+XLA_EXPERIMENTAL=nonzero:masked_select:nms python3 test/ds/test_dynamic_shape_models.py -v
+XLA_EXPERIMENTAL=nonzero:masked_select:nms python3 test/ds/test_dynamic_shapes.py -v
 python3 test/test_autocast.py
+python3 test/test_grad_checkpoint.py
 python3 test/dynamo/test_dynamo.py
 python3 test/spmd/test_spmd_debugging.py
 python3 test/pjrt/test_dtypes.py
 python3 test/pjrt/test_dynamic_plugin_tpu.py
 python3 test/test_fori_loop_with_while_loop_simple_add_dispatch_in_torch.py
 python3 test/test_pallas.py
+python3 test/test_pallas_spmd.py
+python3 test/test_input_output_aliases.py
+python3 test/test_gmm.py
 python3 test/torch_distributed/test_torch_distributed_all_gather_xla_backend.py
 python3 test/torch_distributed/test_torch_distributed_all_reduce_xla_backend.py
 python3 test/torch_distributed/test_torch_distributed_multi_all_reduce_xla_backend.py
diff --git a/torch_patches/README.md b/torch_patches/README.md
deleted file mode 100644
index f6476f64ca5..00000000000
--- a/torch_patches/README.md
+++ /dev/null
@@ -1,32 +0,0 @@
-# Guidelines For Patch File Names
-
-Files with extension '.diff' are consider as git patches by apply script.
-
-A file for PyTorch PR _N_ needs to be named 'N.diff'.
-
-Patch files which are not related to PyTorch PRs, should begin with an 'X' character,
-followed by a two digit number, followed by a dash ('-'), a name, and '.diff'.
-Example:
-
-```
-X10-optimizer.diff
-```
-
-Patch file are alphabetically ordered, so PyTorch PR patches are always applied
-before the non PyTorch ones.
-
-
-There's a special file `torch_patches/.torch_pin`, which is used to coordinate landing PRs in
-`pytorch/pytorch` and `pytorch/xla`.
-
-To test a `pytorch/xla` PR against a `pytorch/pytorch` PR or branch,
-put the PR number or branch name in this file.
-Example:
-
-```
-#32451
-# or
-my_awesome_branch # (must live in `pytorch/pytorch`)
-```
-
-In the case where the pytorch/pytorch PR also depends on the pytorch/xla PR, you will also need to update the https://github.com/pytorch/pytorch/blob/main/.github/ci_commit_pins/xla.txt to match the latest hash of your pytorch/xla PR. To be noted, the hash from a PR produced by a fork won't work in this case. Then you need to find someone from the pytorch/xla team to produe a branch PR for you.
diff --git a/torch_xla/__init__.py b/torch_xla/__init__.py
index ebc0af6c7ad..eef6e48bac2 100644
--- a/torch_xla/__init__.py
+++ b/torch_xla/__init__.py
@@ -6,6 +6,7 @@
 import torch
 import _XLAC
 from ._internal import tpu
+from .version import __version__
 
 logging.basicConfig()
 logger = logging.getLogger(__name__)
@@ -27,8 +28,6 @@ def _set_missing_flags(flags, sets):
 def _setup_xla_flags():
   flags = os.environ.get('XLA_FLAGS', '').split(' ')
   flags = _set_missing_flags(flags, (('xla_cpu_enable_fast_math', 'false'),))
-  flags = _set_missing_flags(
-      flags, (('xla_gpu_simplify_all_fp_conversions', 'false'),))
   flags = _set_missing_flags(flags,
                              (('xla_gpu_force_compilation_parallelism', '8'),))
   os.environ['XLA_FLAGS'] = ' '.join(flags)
@@ -76,6 +75,8 @@ def _setup_default_env():
 
     os.environ.setdefault('ALLOW_MULTIPLE_LIBTPU_LOAD', '1')
     os.environ.setdefault('TPU_ML_PLATFORM', 'PyTorch/XLA')
+    # This is used for ML Framework Telemetry.
+    os.environ.setdefault('TPU_ML_PLATFORM_VERSION', __version__)
 
     if tpu.version() == 4:
       os.environ.setdefault('TPU_MEGACORE', 'megacore_dense')
@@ -149,7 +150,6 @@ def _setup_tpu_vm_library_path() -> bool:
 
 import atexit
 from ._patched_functions import _apply_patches
-from .version import __version__
 
 _found_libtpu = _setup_tpu_vm_library_path()
 
@@ -186,6 +186,27 @@ def _init_xla_lazy_backend():
 # TODO @wonjoo come up with a long term fix in Dynamo.
 torch._dynamo.config.automatic_dynamic_shapes = False
 
+# Activate view-replay on AOTAutograd.
+# See: https://github.com/pytorch/pytorch/pull/124488
+import torch._functorch.config
+
+torch._functorch.config.view_replay_for_aliased_outputs = True
+
+import importlib.metadata
+import warnings
+
+try:
+  # TensorFlow TPU distribution has the same package name as GPU, but not CPU
+  dist = importlib.metadata.distribution('tensorflow')
+  warnings.warn(
+      "`tensorflow` can conflict with `torch-xla`. Prefer `tensorflow-cpu` when"
+      " using PyTorch/XLA. To silence this warning, `pip uninstall -y "
+      "tensorflow && pip install tensorflow-cpu`. If you are in a notebook "
+      "environment such as Colab or Kaggle, restart your notebook runtime "
+      "afterwards.")
+except importlib.metadata.PackageNotFoundError:
+  pass
+
 from .stablehlo import save_as_stablehlo, save_torch_model_as_stablehlo
 
 from .experimental import plugins
diff --git a/torch_xla/core/dynamo_bridge.py b/torch_xla/core/dynamo_bridge.py
index 624acb9cb6f..f6d7a3b6e00 100644
--- a/torch_xla/core/dynamo_bridge.py
+++ b/torch_xla/core/dynamo_bridge.py
@@ -100,15 +100,7 @@ def __call__(self, args):
 
 
 def get_fallback_ops():
-  fallback_ops = []
-  for opname in metrics.counter_names():
-    if "aten::" not in opname:
-      continue
-    val = int(metrics.counter_value(opname))
-    if val > 0:
-      fallback_ops.append(f"{opname}={val}")
-
-  return fallback_ops
+  return metrics.executed_fallback_ops()
 
 
 # Checks that all input args that are tensors are on the same device.
diff --git a/torch_xla/core/xla_model.py b/torch_xla/core/xla_model.py
index 7591e13af29..149aa99b67d 100644
--- a/torch_xla/core/xla_model.py
+++ b/torch_xla/core/xla_model.py
@@ -7,7 +7,7 @@
 import threading
 import time
 import warnings
-from typing import List, Optional
+from typing import List, Optional, TypedDict
 import torch
 import torch.distributed._functional_collectives
 from torch.library import Library
@@ -1434,15 +1434,19 @@ def fork_rng(device=None, enabled=True):
     set_rng_state(xla_rng_state, device=device)
 
 
-def get_memory_info(device):
-  """Retrieves the device memory information.
+class MemoryInfo(TypedDict):
+  bytes_used: str
+  bytes_limit: int
+
+
+def get_memory_info(device: torch.device) -> MemoryInfo:
+  """Retrieves the device memory usage.
 
   Args:
-    device (string): The device whose memory information are requested.
+    device: The device whose memory information are requested.
 
   Returns:
-    A dictionary with `kb_free` (free memory in KB) and `kb_total` (total
-    memory in KB) keys.
+    MemoryInfo dict with memory usage for the given device.
   """
   return torch_xla._XLAC._xla_memory_info(str(device))
 
diff --git a/torch_xla/csrc/BUILD b/torch_xla/csrc/BUILD
index 2faf483f067..a2aadc0c633 100644
--- a/torch_xla/csrc/BUILD
+++ b/torch_xla/csrc/BUILD
@@ -42,6 +42,7 @@ ptxla_cc_library(
         "cross_replica_reduces.cpp",
         "data_ops.cpp",
         "debug_util.cpp",
+        "dl_convertor.cpp",
         "elementwise.cpp",
         "helpers.cpp",
         "ir_dump_util.cpp",
@@ -81,6 +82,7 @@ ptxla_cc_library(
         "cross_replica_reduces.h",
         "data_ops.h",
         "debug_util.h",
+        "dl_convertor.h",
         "elementwise.h",
         "generated_file_include.h",
         "helpers.h",
diff --git a/torch_xla/csrc/aten_cpu_fallback.cpp b/torch_xla/csrc/aten_cpu_fallback.cpp
index d664c60114f..5e84e61ba1a 100644
--- a/torch_xla/csrc/aten_cpu_fallback.cpp
+++ b/torch_xla/csrc/aten_cpu_fallback.cpp
@@ -16,6 +16,18 @@ namespace torch_xla {
 static std::unordered_map<std::string, ::torch_xla::runtime::metrics::Counter*>
     _cpu_fallback_counters;
 
+// Get all the executed fallback operations.
+// In other words, get all of them whose counters are not zero.
+std::vector<std::string> GetFallbackOperations() {
+  std::vector<std::string> fallback;
+  for (auto const& pair : _cpu_fallback_counters) {
+    if (pair.second->Value() != 0) {
+      fallback.push_back(pair.first);
+    }
+  }
+  return fallback;
+}
+
 void xla_cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
   XLA_FN_TRACK(3);
   const auto name = c10::toString(op.operator_name());
diff --git a/torch_xla/csrc/aten_cpu_fallback.h b/torch_xla/csrc/aten_cpu_fallback.h
index 572d4e1009a..706c7aa40a5 100644
--- a/torch_xla/csrc/aten_cpu_fallback.h
+++ b/torch_xla/csrc/aten_cpu_fallback.h
@@ -7,6 +7,8 @@ namespace torch_xla {
 
 void xla_cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack);
 
+std::vector<std::string> GetFallbackOperations();
+
 }  // namespace torch_xla
 
-#endif  // XLA_TORCH_XLA_CSRC_ATEN_CPU_FALLBACK_H_
\ No newline at end of file
+#endif  // XLA_TORCH_XLA_CSRC_ATEN_CPU_FALLBACK_H_
diff --git a/torch_xla/csrc/aten_xla_type.cpp b/torch_xla/csrc/aten_xla_type.cpp
index a7ae1c47964..060d52d61d7 100644
--- a/torch_xla/csrc/aten_xla_type.cpp
+++ b/torch_xla/csrc/aten_xla_type.cpp
@@ -29,6 +29,7 @@
 #include "torch_xla/csrc/helpers.h"
 #include "torch_xla/csrc/ops/as_strided.h"
 #include "torch_xla/csrc/ops/as_strided_view_update.h"
+#include "torch_xla/csrc/ops/device_data.h"
 #include "torch_xla/csrc/ops/diagonal_view_update.h"
 #include "torch_xla/csrc/ops/einsum_utilities.h"
 #include "torch_xla/csrc/ops/index_ops.h"
@@ -1290,6 +1291,38 @@ at::Tensor XLANativeFunctions::embedding_dense_backward(
       num_weights, padding_idx, scale_grad_by_freq));
 }
 
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
+XLANativeFunctions::_embedding_bag_forward_only(
+    const at::Tensor& weight, const at::Tensor& indices,
+    const at::Tensor& offsets, bool scale_grad_by_freq, int64_t mode,
+    bool sparse, const c10::optional<at::Tensor>& per_sample_weights,
+    bool include_last_offset, int64_t padding_idx) {
+  TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::");
+  if (mode == 1 || scale_grad_by_freq || sparse || padding_idx != -1) {
+    return at::native::call_fallback_fn<
+        &xla_cpu_fallback,
+        ATEN_OP(_embedding_bag_forward_only)>::call(weight, indices, offsets,
+                                                    scale_grad_by_freq, mode,
+                                                    sparse, per_sample_weights,
+                                                    include_last_offset,
+                                                    padding_idx);
+  }
+  auto indices_tensor = bridge::GetXlaTensor(indices);
+  auto sample_weights =
+      per_sample_weights.has_value() && per_sample_weights.value().defined()
+          ? bridge::GetXlaTensor(per_sample_weights.value())
+          : tensor_methods::full_like(indices_tensor, 1.0,
+                                      *torch_xla::bridge::GetXlaDevice(weight),
+                                      at::ScalarType::Float);
+  auto result = tensor_methods::embedding_bag(
+      bridge::GetXlaTensor(weight), indices_tensor,
+      bridge::GetXlaTensor(offsets), mode, sample_weights, include_last_offset);
+  return std::make_tuple(bridge::AtenFromXlaTensor(std::get<0>(result)),
+                         bridge::AtenFromXlaTensor(std::get<1>(result)),
+                         bridge::AtenFromXlaTensor(std::get<2>(result)),
+                         bridge::AtenFromXlaTensor(std::get<3>(result)));
+}
+
 at::Tensor XLANativeFunctions::empty_symint(
     at::SymIntArrayRef sym_size, c10::optional<at::ScalarType> dtype,
     c10::optional<at::Layout> layout, c10::optional<at::Device> device,
@@ -1438,9 +1471,18 @@ at::Tensor XLANativeFunctions::full(at::IntArrayRef size,
     return at::native::call_fallback_fn<&xla_cpu_fallback, ATEN_OP(full)>::call(
         size, fill_value, dtype, layout, device, pin_memory);
   }
-  return bridge::AtenFromXlaTensor(tensor_methods::full(
-      absl::Span<const int64_t>(size), fill_value,
-      GetXlaDeviceOrCurrent(device), at::dtype_or_default(dtype)));
+  at::ScalarType intend_dtype;
+  if (dtype || fill_value.isFloatingPoint()) {
+    // Respect the dtype if it is being explictlly passed in.
+    // All python scalar will be passed in as float64 to the backend, but the
+    // default behavior for pytorch is to return a float32 tensor in this case.
+    intend_dtype = at::dtype_or_default(dtype);
+  } else {
+    intend_dtype = fill_value.type();
+  }
+  return bridge::AtenFromXlaTensor(
+      tensor_methods::full(absl::Span<const int64_t>(size), fill_value,
+                           GetXlaDeviceOrCurrent(device), intend_dtype));
 }
 
 at::Tensor XLANativeFunctions::gather(const at::Tensor& self, int64_t dim,
@@ -1462,8 +1504,10 @@ at::Tensor XLANativeFunctions::gelu_backward(const at::Tensor& grad,
                                              const at::Tensor& self,
                                              c10::string_view approximate) {
   TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::");
+  at::ScalarType result_type = at::result_type(grad, self);
   return bridge::AtenFromXlaTensor(tensor_methods::gelu_backward(
-      bridge::GetXlaTensor(grad), bridge::GetXlaTensor(self), approximate));
+      bridge::GetXlaTensor(grad.to(result_type)),
+      bridge::GetXlaTensor(self.to(result_type)), approximate));
 }
 
 at::Tensor XLANativeFunctions::hardtanh(const at::Tensor& self,
@@ -2497,7 +2541,38 @@ void XLANativeFunctions::_propagate_xla_data(const at::Tensor& input,
   // 1) Aid XLA's InputOutputAlias.
   auto input_tensor = bridge::GetXlaTensor(input);
   auto output_tensor = bridge::GetXlaTensor(output);
-  output_tensor->data()->alias_id = input_tensor->GetUniqueId();
+  if (input_tensor->CurrentDataHandle() != nullptr ||
+      (input_tensor->CurrentIrValue().node != nullptr &&
+       torch_xla::DeviceData::Cast(
+           input_tensor->CurrentIrValue().node.get()))) {
+    /*
+    if input has a XLAData or holds a devicedata node, set alias_id to
+    tensor_id. Consider the case.
+
+    // x.tensor_id = 1, x.alias_id = 1
+    x = torch.randn(5,5).to(xla_device())
+    // x.tensor_id = 2, x.alias_id should be 1
+    x += 1
+    xm.mark_step()
+    // x.tensor_id =3, x.alias_id should be 2 since input tensor id will be 2
+    // for this graph
+    x *= 1 of 1
+    */
+    output_tensor->data()->alias_id = input_tensor->GetUniqueId();
+  } else {
+    /*
+    Consider the case
+
+    // x.tensor_id = 1, x.alias_id = 1
+    x = torch.randn(5,5).to(xla_device())
+    // x.tensor_id = 2, x.alias_id should be 1
+    x += 1
+    // x.tensor_id = 3, x.alias_id should still be 1
+    x * = 2
+    xm.mark_step()
+    */
+    output_tensor->data()->alias_id = input_tensor->data()->alias_id;
+  }
 
   // 2) Aid SPMD.
   XLATensor::ShardingSpecPtr sharding = input_tensor->sharding_spec();
@@ -3085,8 +3160,12 @@ at::Tensor XLANativeFunctions::squeeze_copy(const at::Tensor& self,
 
 at::Tensor XLANativeFunctions::stack(at::TensorList tensors, int64_t dim) {
   TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::");
+  at::ScalarType result_type = at::native::result_type(tensors);
+  std::vector<at::Tensor> c_tensors(tensors.size());
+  std::transform(tensors.begin(), tensors.end(), c_tensors.begin(),
+                 [=](const at::Tensor& t) { return t.to(result_type); });
   return bridge::AtenFromXlaTensor(
-      tensor_methods::stack(bridge::GetXlaTensors(tensors), dim));
+      tensor_methods::stack(bridge::GetXlaTensors(c_tensors), dim));
 }
 
 at::Tensor XLANativeFunctions::std(const at::Tensor& self, bool unbiased) {
@@ -3709,6 +3788,7 @@ at::Tensor XLANativeFunctions::embedding_symint(const at::Tensor& weight,
                                         scale_grad_by_freq, sparse);
   }
 
+  // TODO: We need to make use of the TPU embedding core here eventually.
   TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::");
   return bridge::AtenFromXlaTensor(tensor_methods::embedding(
       bridge::GetXlaTensor(weight), bridge::GetXlaTensor(indices)));
diff --git a/torch_xla/csrc/debug_util.cpp b/torch_xla/csrc/debug_util.cpp
index db534dd5292..11843a39a59 100644
--- a/torch_xla/csrc/debug_util.cpp
+++ b/torch_xla/csrc/debug_util.cpp
@@ -7,6 +7,7 @@
 #include <fstream>
 #include <iostream>
 #include <mutex>
+#include <regex>
 #include <sstream>
 #include <unordered_set>
 
@@ -233,11 +234,25 @@ static bool endsWith(const std::string& str, const std::string& suffix) {
          0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
 }
 
+int GetDebugLevel() {
+  static const bool pt_xla_debug_enabled =
+      runtime::sys_util::GetEnvBool("PT_XLA_DEBUG", false);
+  static const int pt_xla_debug_level_env =
+      runtime::sys_util::GetEnvInt("PT_XLA_DEBUG_LEVEL", -1);
+  static const int default_debug_level_if_enabled = 100;
+  // default the pt_xla_debug_level to 100 if PT_XLA_DEBUG is set but
+  // PT_XLA_DEBUG_LEVEL is not specified.
+  static const int pt_xla_debug_level =
+      (pt_xla_debug_level_env == -1) && pt_xla_debug_enabled
+          ? default_debug_level_if_enabled
+          : pt_xla_debug_level_env;
+  return pt_xla_debug_level;
+}
+
 void DebugUtil::analyze_graph_execution_python_frame(
     GraphAnalysisSource source, torch::lazy::hash_t graph_hash,
     const xla::ProgramShape* program_shape) {
-  static const bool pt_xla_debug_enabled =
-      runtime::sys_util::GetEnvBool("PT_XLA_DEBUG", false);
+  static const int pt_xla_debug_level = GetDebugLevel();
   static const bool is_master_process =
       (runtime::sys_util::GetEnvInt("PJRT_LOCAL_PROCESS_RANK", 0) == 0);
   static const std::string debug_file_name =
@@ -248,7 +263,12 @@ void DebugUtil::analyze_graph_execution_python_frame(
   static const std::string executation_output_prefix = "Execution Analysis: ";
   static const std::string compilation_output_prefix = "Compilation Analysis: ";
 
-  if (!pt_xla_debug_enabled) {
+  if (pt_xla_debug_level <= 0) {
+    return;
+  }
+
+  if (pt_xla_debug_level <= 1 && source != GraphAnalysisSource::Compilation) {
+    // for debug level <=1, only output compilation analysis in this function.
     return;
   }
 
@@ -355,4 +375,68 @@ void DebugUtil::analyze_graph_execution_python_frame(
   }
 }
 
+void DebugUtil::post_compilation_analysis(
+    runtime::ComputationClient::ComputationPtr computation) {
+  static const int pt_xla_debug_level = GetDebugLevel();
+  static const bool is_master_process =
+      (runtime::sys_util::GetEnvInt("PJRT_LOCAL_PROCESS_RANK", 0) == 0);
+  static const std::string debug_file_name =
+      runtime::sys_util::GetEnvString("PT_XLA_DEBUG_FILE", "");
+  if (pt_xla_debug_level <= 0 || !is_master_process) {
+    return;
+  }
+  static const std::string debug_output_prefix = "Post Compilation Analysis: ";
+  std::stringstream ss;
+  ss << "\n"
+     << debug_output_prefix
+     << "======================================================================"
+        "=========="
+     << "\n";
+  std::string memory_info = computation->get_memory_info();
+
+  std::vector<std::string> keysToExtract = {
+      "generated_code_size_in_bytes", "argument_size_in_bytes",
+      "output_size_in_bytes", "alias_size_in_bytes", "temp_size_in_bytes"};
+  std::vector<std::string> sizes_in_gb;
+
+  for (const std::string& key : keysToExtract) {
+    std::regex pattern(key + "=([0-9]+)");
+    std::smatch match;
+
+    if (std::regex_search(memory_info, match, pattern)) {
+      sizes_in_gb.push_back(
+          std::to_string(std::stoll(match[1]) * 1.0 / 1024 / 1024 / 1024));
+    } else {
+      sizes_in_gb.push_back("Unknown ");
+    }
+  }
+
+  ss << debug_output_prefix << "Graph input size: " << sizes_in_gb[1]
+     << " GB\n";
+  ss << debug_output_prefix << "Graph output size: " << sizes_in_gb[2]
+     << " GB\n";
+  ss << debug_output_prefix << "Aliased Input size: " << sizes_in_gb[3]
+     << " GB\n";
+  ss << debug_output_prefix << "Intermediate tensor size: " << sizes_in_gb[4]
+     << " GB\n";
+  ss << debug_output_prefix << "Compiled program size: " << sizes_in_gb[0]
+     << " GB\n";
+  ss << debug_output_prefix
+     << "----------------------------------------------------------------------"
+        "----------"
+     << "\n";
+  ss << debug_output_prefix
+     << "======================================================================"
+        "=========="
+     << "\n";
+  if (debug_file_name == "") {
+    // print to stderr by default
+    std::cerr << ss.str();
+  } else {
+    std::ofstream outFile;
+    outFile.open(debug_file_name, std::ios_base::app);
+    outFile << ss.rdbuf();
+  }
+}
+
 }  // namespace torch_xla
diff --git a/torch_xla/csrc/debug_util.h b/torch_xla/csrc/debug_util.h
index 7dcc906fb77..516245d8768 100644
--- a/torch_xla/csrc/debug_util.h
+++ b/torch_xla/csrc/debug_util.h
@@ -6,6 +6,7 @@
 #include <vector>
 
 #include "absl/types/span.h"
+#include "torch_xla/csrc/runtime/computation_client.h"
 #include "torch_xla/csrc/tensor.h"
 
 namespace torch_xla {
@@ -60,6 +61,9 @@ class DebugUtil {
   static void analyze_graph_execution_python_frame(
       GraphAnalysisSource source, torch::lazy::hash_t graph_hash = 0,
       const xla::ProgramShape* program_shape = nullptr);
+
+  static void post_compilation_analysis(
+      runtime::ComputationClient::ComputationPtr computation);
 };
 
 }  // namespace torch_xla
diff --git a/torch_xla/csrc/dl_convertor.cpp b/torch_xla/csrc/dl_convertor.cpp
new file mode 100644
index 00000000000..d29401be8fe
--- /dev/null
+++ b/torch_xla/csrc/dl_convertor.cpp
@@ -0,0 +1,345 @@
+#include "torch_xla/csrc/dl_convertor.h"
+
+#include <ATen/DLConvertor.h>
+
+#include "absl/types/span.h"
+#include "torch_xla/csrc/aten_xla_bridge.h"
+#include "torch_xla/csrc/ops/device_data.h"
+#include "torch_xla/csrc/runtime/computation_client.h"
+#include "torch_xla/csrc/runtime/debug_macros.h"
+#include "torch_xla/csrc/runtime/pjrt_computation_client.h"
+#include "torch_xla/csrc/runtime/runtime.h"
+#include "torch_xla/csrc/runtime/tf_logging.h"
+#include "torch_xla/csrc/tensor.h"
+#include "torch_xla/csrc/tensor_util.h"
+#include "torch_xla/csrc/unwrap_data.h"
+#include "xla/pjrt/pjrt_client.h"
+#include "xla/pjrt/pjrt_future.h"
+#include "xla/pjrt/pjrt_layout.h"
+#include "xla/status.h"
+
+namespace torch_xla {
+
+struct DLPackTensor {
+  ~DLPackTensor();
+  std::unique_ptr<xla::PjRtBuffer::ExternalReference> external_reference;
+  std::shared_ptr<xla::PjRtBuffer> buffer_reference;
+
+  std::vector<int64_t> shape;
+  std::vector<int64_t> strides;
+  DLManagedTensor tensor;
+};
+
+DLPackTensor::~DLPackTensor() {
+  if (external_reference) {
+    external_reference.reset(nullptr);
+  }
+}
+
+void DLPackTensorDeleter(DLManagedTensor* t) {
+  if (t) {
+    delete static_cast<DLPackTensor*>(t->manager_ctx);
+  }
+}
+
+DLDeviceType DLDeviceTypeForDevice(const xla::PjRtDevice& device) {
+  if (device.client()->platform_id() == xla::CpuId()) {
+    return DLDeviceType::kDLCPU;
+  } else if (device.client()->platform_id() == xla::CudaId()) {
+    return DLDeviceType::kDLCUDA;
+  }
+  XLA_ERROR() << "Device " << device.DebugString()
+              << " cannot be used as a DLPack device.";
+}
+
+// Reference: https://github.com/openxla/xla/blob/main/xla/python/dlpack.cc
+DLDevice DLDeviceForDevice(const xla::PjRtDevice& device) {
+  DLDevice dlDevice;
+  dlDevice.device_type = DLDeviceTypeForDevice(device);
+  dlDevice.device_id = device.local_hardware_id();
+  return dlDevice;
+}
+
+// Reference: https://github.com/openxla/xla/blob/main/xla/python/dlpack.cc
+DLDataType PrimitiveTypeToDLDataType(xla::PrimitiveType type) {
+  switch (type) {
+    case xla::PrimitiveType::S8:
+      return DLDataType{kDLInt, 8, 1};
+    case xla::PrimitiveType::S16:
+      return DLDataType{kDLInt, 16, 1};
+    case xla::PrimitiveType::S32:
+      return DLDataType{kDLInt, 32, 1};
+    case xla::PrimitiveType::S64:
+      return DLDataType{kDLInt, 64, 1};
+    case xla::PrimitiveType::U8:
+      return DLDataType{kDLUInt, 8, 1};
+    case xla::PrimitiveType::U16:
+      return DLDataType{kDLUInt, 16, 1};
+    case xla::PrimitiveType::U32:
+      return DLDataType{kDLUInt, 32, 1};
+    case xla::PrimitiveType::U64:
+      return DLDataType{kDLUInt, 64, 1};
+    case xla::PrimitiveType::F16:
+      return DLDataType{kDLFloat, 16, 1};
+    case xla::PrimitiveType::F32:
+      return DLDataType{kDLFloat, 32, 1};
+    case xla::PrimitiveType::F64:
+      return DLDataType{kDLFloat, 64, 1};
+    case xla::PrimitiveType::BF16:
+      return DLDataType{kDLBfloat, 16, 1};
+    case xla::PrimitiveType::PRED:
+      return DLDataType{kDLBool, 8, 1};
+    case xla::PrimitiveType::C64:
+      return DLDataType{kDLComplex, 64, 1};
+    case xla::PrimitiveType::C128:
+      return DLDataType{kDLComplex, 128, 1};
+    default:
+      XLA_ERROR() << "XLA type " << xla::PrimitiveType_Name(type)
+                  << " has no DLPack equivalent";
+  }
+}
+
+std::vector<int64_t> StridesForShape(xla::PrimitiveType element_type,
+                                     absl::Span<const int64_t> dimensions,
+                                     const xla::Layout& layout) {
+  XLA_CHECK_EQ(dimensions.size(), layout.minor_to_major().size());
+  std::vector<int64_t> strides;
+  strides.resize(dimensions.size());
+  int64_t stride = 1;
+  for (int i : layout.minor_to_major()) {
+    strides[i] = stride;
+    stride *= dimensions[i];
+  }
+  return strides;
+}
+
+// Convert an XLA tensor to a dlPack tensor.
+DLManagedTensor* toDLPack(const at::Tensor& input) {
+  std::shared_ptr<runtime::ComputationClient::Data> handle =
+      get_data_handle(input);
+  XLA_CHECK(handle != nullptr)
+      << "Could not extract a valid data handle from the input tensor";
+
+  std::shared_ptr<xla::PjRtBuffer> pjrt_buffer =
+      runtime::GetComputationClient()->GetPjRtBuffer(handle);
+  XLA_CHECK(pjrt_buffer != nullptr) << "Could not get a valid pjrt_buffer";
+
+  XLA_CHECK(!pjrt_buffer->IsTuple())
+      << "Unimplemented. BufferToDLPackManagedTensor is not "
+         "implemented for tuple buffers.";
+  XLA_CHECK(!pjrt_buffer->has_dynamic_dimensions())
+      << "Unimplemented. DynamicShape is not implemented in DLPack.";
+
+  auto pack = std::make_unique<DLPackTensor>();
+  DLTensor& dt = pack->tensor.dl_tensor;
+  {
+    // AcquireExternalReference may block
+    auto external_ref = pjrt_buffer->AcquireExternalReference();
+    XLA_CHECK_OK(external_ref.status());
+    pack->external_reference = std::move(external_ref.value());
+    xla::PjRtFuture<> future = pjrt_buffer->GetReadyFuture();
+    absl::Status status = future.Await();
+    XLA_CHECK_OK(status);
+  }
+  pack->buffer_reference = pjrt_buffer;
+
+  dt.data = pack->external_reference->OpaqueDeviceMemoryDataPointer();
+  pack->tensor.manager_ctx = pack.get();
+  pack->tensor.deleter = DLPackTensorDeleter;
+  dt.device = DLDeviceForDevice(*pjrt_buffer->device());
+  dt.device.device_id = pjrt_buffer->device()->local_hardware_id();
+  dt.ndim = pjrt_buffer->dimensions().size();
+  dt.dtype = PrimitiveTypeToDLDataType(pjrt_buffer->element_type());
+
+  pack->shape = std::vector<int64_t>(pjrt_buffer->dimensions().begin(),
+                                     pjrt_buffer->dimensions().end());
+  xla::Layout xla_layout = xla::GetXlaLayoutUnsafe(pjrt_buffer->layout());
+  pack->strides = StridesForShape(pjrt_buffer->element_type(),
+                                  pjrt_buffer->dimensions(), xla_layout);
+  dt.shape = reinterpret_cast<std::int64_t*>(pack->shape.data());
+  dt.strides = reinterpret_cast<std::int64_t*>(pack->strides.data());
+  dt.byte_offset = 0;
+
+  return &(pack.release()->tensor);
+}
+
+// Reference: https://github.com/openxla/xla/blob/main/xla/python/dlpack.cc
+absl::StatusOr<xla::PjRtDevice*> DeviceForDLDevice(const DLDevice& context) {
+  switch (context.device_type) {
+    case DLDeviceType::kDLCPU:
+      XLA_CHECK_EQ(runtime::GetComputationClient()->GetPlatformID(),
+                   xla::CpuId());
+      return runtime::GetComputationClient()->LookupAddressableDevice(
+          context.device_id);
+    case DLDeviceType::kDLCUDA:
+      XLA_CHECK_EQ(runtime::GetComputationClient()->GetPlatformID(),
+                   xla::CudaId());
+      return runtime::GetComputationClient()->LookupAddressableDevice(
+          context.device_id);
+    default:
+      return tsl::errors::InvalidArgument(
+          "Unknown/unsupported DLPack device type %d", context.device_type);
+  }
+}
+
+// Reference: https://github.com/openxla/xla/blob/main/xla/python/dlpack.cc
+absl::StatusOr<xla::PrimitiveType> DLDataTypeToPrimitiveType(DLDataType type) {
+  if (type.lanes != 1) {
+    return tsl::errors::Unimplemented(
+        "DLPack types with lanes != 1 not implemented, got %d", type.lanes);
+  }
+  switch (type.code) {
+    case kDLBool:
+      switch (type.bits) {
+        case 8:
+          return xla::PrimitiveType::PRED;
+        default:
+          return tsl::errors::Unimplemented(
+              "Only 8-bit DLPack booleans are supported, got %d bits",
+              type.bits);
+      }
+    case kDLInt:
+      switch (type.bits) {
+        case 8:
+          return xla::PrimitiveType::S8;
+        case 16:
+          return xla::PrimitiveType::S16;
+        case 32:
+          return xla::PrimitiveType::S32;
+        case 64:
+          return xla::PrimitiveType::S64;
+        default:
+          return tsl::errors::Unimplemented(
+              "Invalid or unsupported DLPack integer width: %d bits",
+              type.bits);
+      }
+    case kDLUInt:
+      switch (type.bits) {
+        case 8:
+          return xla::PrimitiveType::U8;
+        case 16:
+          return xla::PrimitiveType::U16;
+        case 32:
+          return xla::PrimitiveType::U32;
+        case 64:
+          return xla::PrimitiveType::U64;
+        default:
+          return tsl::errors::Unimplemented(
+              "Invalid or unsupported DLPack unsigned integer width: %d bits",
+              type.bits);
+      }
+    case kDLFloat:
+      switch (type.bits) {
+        case 16:
+          return xla::PrimitiveType::F16;
+        case 32:
+          return xla::PrimitiveType::F32;
+        case 64:
+          return xla::PrimitiveType::F64;
+        default:
+          return tsl::errors::Unimplemented(
+              "Invalid or unsupported DLPack float width: %d bits", type.bits);
+      }
+    case kDLBfloat:
+      switch (type.bits) {
+        case 16:
+          return xla::PrimitiveType::BF16;
+        default:
+          return tsl::errors::Unimplemented(
+              "Invalid or unsupported DLPack Bfloat width: %d bits", type.bits);
+      }
+    case kDLComplex:
+      switch (type.bits) {
+        case 64:
+          return xla::PrimitiveType::C64;
+        case 128:
+          return xla::PrimitiveType::C128;
+        default:
+          return tsl::errors::Unimplemented(
+              "Invalid or unsupported DLPack complex width: %d bits",
+              type.bits);
+      }
+    default:
+      return tsl::errors::Unimplemented(
+          "Unknown or invalid DLPack type code %d", type.code);
+  }
+}
+
+// Reference: https://github.com/openxla/xla/blob/main/xla/python/dlpack.cc
+absl::StatusOr<std::vector<int64_t>> StridesToLayout(
+    absl::Span<int64_t const> dims, absl::Span<int64_t const> strides) {
+  XLA_CHECK_EQ(dims.size(), strides.size());
+  std::vector<int64_t> minor_to_major(dims.size());
+  std::iota(minor_to_major.begin(), minor_to_major.end(), 0);
+  absl::c_sort(minor_to_major, [&](int a, int b) {
+    if (strides[a] < strides[b]) {
+      return true;
+    }
+    if (strides[a] > strides[b]) {
+      return false;
+    }
+    // If two dimensions have the same stride, prefer the major-to-minor
+    // interpretation of the ordering, since that's what JAX wants.
+    return b < a;
+  });
+  int64_t stride = 1;
+  for (int64_t d : minor_to_major) {
+    if (dims[d] > 1 && strides[d] != stride) {
+      return tsl::errors::Unimplemented(
+          "Only DLPack tensors with trivial (compact) striding are supported; "
+          "i.e., tensors whose striding represents a transposition of the "
+          "underlying buffer but not broadcasting. Dimensions were: [%s], "
+          "strides were [%s].",
+          absl::StrJoin(dims, ","), absl::StrJoin(strides, ","));
+    }
+    stride *= dims[d];
+  }
+  return minor_to_major;
+}
+
+at::Tensor fromDLPack(DLManagedTensor* dlmt) {
+  XLA_CHECK(dlmt->dl_tensor.ndim >= 0)
+      << "Number of dimensions in DLManagedTensor must be nonnegative, got "
+      << dlmt->dl_tensor.ndim;
+  xla::PjRtDevice* device = DeviceForDLDevice(dlmt->dl_tensor.device).value();
+  absl::Span<int64_t const> dimensions(
+      const_cast<int64_t*>(dlmt->dl_tensor.shape), dlmt->dl_tensor.ndim);
+  xla::PrimitiveType element_type =
+      DLDataTypeToPrimitiveType(dlmt->dl_tensor.dtype).value();
+
+  std::vector<int64_t> minor_to_major;
+  if (dlmt->dl_tensor.strides &&
+      absl::c_find(dimensions, 0) == dimensions.end()) {
+    absl::Span<int64_t const> strides(
+        const_cast<int64_t*>(dlmt->dl_tensor.strides), dlmt->dl_tensor.ndim);
+    minor_to_major = StridesToLayout(dimensions, strides).value();
+  } else {
+    minor_to_major.resize(dlmt->dl_tensor.ndim);
+    std::iota(minor_to_major.rbegin(), minor_to_major.rend(), 0);
+  }
+  xla::Shape shape = xla::ShapeUtil::MakeShapeWithDenseLayout(
+      element_type, dimensions, minor_to_major);
+
+  std::function<void()> on_delete_callback;
+  if (dlmt->deleter) {
+    on_delete_callback = [dlmt]() { dlmt->deleter(dlmt); };
+  }
+  xla::StatusOr<std::unique_ptr<xla::PjRtBuffer>> pjrt_buffer =
+      device->client()->CreateViewOfDeviceBuffer(
+          static_cast<char*>(dlmt->dl_tensor.data) +
+              dlmt->dl_tensor.byte_offset,
+          shape, device, on_delete_callback);
+  XLA_CHECK_OK(pjrt_buffer.status()) << "Failed to create a pjrt buffer.";
+  XLA_CHECK(pjrt_buffer.value() != nullptr) << "pjrt buffer is null.";
+
+  runtime::ComputationClient::DataPtr data =
+      runtime::PjRtComputationClient::CreateData(
+          runtime::GetComputationClient()->PjRtDeviceToString(device), shape,
+          std::move(pjrt_buffer.value()));
+
+  at::ScalarType tensor_type = at::toScalarType(dlmt->dl_tensor.dtype);
+  XLATensorPtr xla_tensor = XLATensor::Create(data, tensor_type);
+  return bridge::AtenFromXlaTensor(xla_tensor);
+}
+
+}  // namespace torch_xla
diff --git a/torch_xla/csrc/dl_convertor.h b/torch_xla/csrc/dl_convertor.h
new file mode 100644
index 00000000000..f5a54823e2e
--- /dev/null
+++ b/torch_xla/csrc/dl_convertor.h
@@ -0,0 +1,14 @@
+#ifndef XLA_TORCH_XLA_CSRC_DL_CONVERTOR_H_
+#define XLA_TORCH_XLA_CSRC_DL_CONVERTOR_H_
+
+#include <ATen/Tensor.h>
+#include <ATen/dlpack.h>
+
+namespace torch_xla {
+
+DLManagedTensor* toDLPack(const at::Tensor& src);
+at::Tensor fromDLPack(DLManagedTensor* src);
+
+}  // namespace torch_xla
+
+#endif  // XLA_TORCH_XLA_CSRC_DL_CONVERTOR_H_
diff --git a/torch_xla/csrc/dtype.cpp b/torch_xla/csrc/dtype.cpp
index e26e4e27fe9..c5310a9e2ea 100644
--- a/torch_xla/csrc/dtype.cpp
+++ b/torch_xla/csrc/dtype.cpp
@@ -10,6 +10,9 @@ namespace {
 bool ShouldUseBF16() {
   bool use_bf16 = runtime::sys_util::GetEnvBool("XLA_USE_BF16", false);
   if (use_bf16) {
+    std::cout
+        << "XLA_USE_BF16 will be deprecated after the 2.4 release, please "
+           "convert your model to bf16 directly\n";
     TF_LOG(INFO) << "Using BF16 data type for floating point values";
   }
   return use_bf16;
@@ -18,6 +21,9 @@ bool ShouldUseBF16() {
 bool ShouldUseF16() {
   bool use_fp16 = runtime::sys_util::GetEnvBool("XLA_USE_FP16", false);
   if (use_fp16) {
+    std::cout
+        << "XLA_USE_FP16 will be deprecated after the 2.4 release, please "
+           "convert your model to fp16 directly\n";
     TF_LOG(INFO) << "Using F16 data type for floating point values";
   }
   return use_fp16;
@@ -27,6 +33,9 @@ bool ShouldDowncastToBF16() {
   bool downcast_bf16 =
       runtime::sys_util::GetEnvBool("XLA_DOWNCAST_BF16", false);
   if (downcast_bf16) {
+    std::cout
+        << "XLA_DOWNCAST_BF16 will be deprecated after the 2.4 release, please "
+           "downcast your model directly\n";
     TF_LOG(INFO) << "Downcasting floating point values, F64->F32, F32->BF16";
   }
   return downcast_bf16;
@@ -36,6 +45,9 @@ bool ShouldDowncastToF16() {
   bool downcast_fp16 =
       runtime::sys_util::GetEnvBool("XLA_DOWNCAST_FP16", false);
   if (downcast_fp16) {
+    std::cout
+        << "XLA_DOWNCAST_FP16 will be deprecated after the 2.4 release, please "
+           "downcast your model directly\n";
     TF_LOG(INFO) << "Downcasting floating point values, F64->F32, F32->FP16";
   }
   return downcast_fp16;
@@ -45,6 +57,8 @@ bool ShouldUse32BitLong() {
   bool use_32bit_long =
       runtime::sys_util::GetEnvBool("XLA_USE_32BIT_LONG", false);
   if (use_32bit_long) {
+    std::cout
+        << "XLA_USE_32BIT_LONG will be deprecated after the 2.4 release\n";
     TF_LOG(INFO) << "Using 32bit integers for kLong values";
   }
   return use_32bit_long;
diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
index e20e28fbb8f..362df87e08d 100644
--- a/torch_xla/csrc/init_python_bindings.cpp
+++ b/torch_xla/csrc/init_python_bindings.cpp
@@ -1,3 +1,4 @@
+#include <ATen/dlpack.h>
 #include <Python.h>
 #include <c10/core/Device.h>
 #include <c10/util/Optional.h>
@@ -32,8 +33,10 @@
 #include "pybind11/stl_bind.h"
 #include "torch_xla/csrc/XLANativeFunctions.h"
 #include "torch_xla/csrc/aten_autograd_ops.h"
+#include "torch_xla/csrc/aten_cpu_fallback.h"
 #include "torch_xla/csrc/aten_xla_bridge.h"
 #include "torch_xla/csrc/device.h"
+#include "torch_xla/csrc/dl_convertor.h"
 #include "torch_xla/csrc/dtype.h"
 #include "torch_xla/csrc/helpers.h"
 #include "torch_xla/csrc/ir.h"
@@ -725,8 +728,8 @@ py::dict GetMemoryInfo(const std::string& device_str) {
         runtime::GetComputationClient()->GetMemoryInfo(device.toString());
   }
   auto py_dict = py::dict();
-  py_dict["kb_free"] = mem_info.kb_free;
-  py_dict["kb_total"] = mem_info.kb_total;
+  py_dict["bytes_used"] = mem_info.bytes_used;
+  py_dict["bytes_limit"] = mem_info.bytes_limit;
   return py_dict;
 }
 
@@ -1115,6 +1118,36 @@ void BuildLoweringContextSubmodule(py::module* m) {
       .def("get_name_string", &PyLoweringContext::GetNameString);
 }
 
+// Used in the to_dlpack.
+void dlPack_Capsule_Destructor(PyObject* data) {
+  if (!PyCapsule_IsValid(data, "dltensor")) {
+    return;
+  }
+  DLManagedTensor* dlMTensor =
+      static_cast<DLManagedTensor*>(PyCapsule_GetPointer(data, "dltensor"));
+  if (dlMTensor) {
+    dlMTensor->deleter(dlMTensor);
+  } else {
+    // The tensor has been deleted. Clear any error from
+    // PyCapsule_GetPointer.
+    PyErr_Clear();
+  }
+}
+
+at::Tensor tensor_fromDLPack(PyObject* data) {
+  DLManagedTensor* dlMTensor =
+      (DLManagedTensor*)PyCapsule_GetPointer(data, "dltensor");
+  XLA_CHECK(dlMTensor != nullptr)
+      << "from_dlpack received an invalid capsule. Note that a DLTensor "
+         "capsule can be consumed only once. You may have already constructed "
+         "a tensor from it once.";
+
+  at::Tensor tensor = torch_xla::fromDLPack(dlMTensor);
+  PyCapsule_SetName(data, "used_dltensor");
+  PyCapsule_SetDestructor(data, nullptr);
+  return tensor;
+}
+
 void InitXlaModuleBindings(py::module m) {
   m.def("_prepare_to_exit", []() { PrepareToExit(); });
   m.def("_xla_runtime_is_initialized", []() {
@@ -1280,6 +1313,9 @@ void InitXlaModuleBindings(py::module m) {
       return runtime::GetComputationClient()->GetLocalDevices();
     }
   });
+  m.def("_get_stream_for_cuda_device", [](const int device_id) {
+    return runtime::GetComputationClient()->GetCudaStreamForDevice(device_id);
+  });
   m.def("_xla_num_devices", []() -> int64_t {
     if (UseVirtualDevice()) {
       return 1;
@@ -1766,6 +1802,7 @@ void InitXlaModuleBindings(py::module m) {
         }
       },
       py::arg("devices"));
+  m.def("_get_executed_fallback_ops", []() { return GetFallbackOperations(); });
   m.def("_xla_counter_names", []() {
     auto counter_names = torch::lazy::GetCounterNames();
     auto xla_counter_names = runtime::metrics::GetCounterNames();
@@ -1842,9 +1879,8 @@ void InitXlaModuleBindings(py::module m) {
         return GetLiveTensorsReport(nodes_threshold, device);
       },
       py::arg("nodes_threshold") = 100, py::arg("device") = "");
-  m.def("_xla_memory_info", [](const std::string& device) -> py::object {
-    return GetMemoryInfo(device);
-  });
+  m.def("_xla_memory_info",
+        [](const std::string& device) { return GetMemoryInfo(device); });
   m.def(
       "_xla_set_use_full_mat_mul_precision",
       [](bool use_full_mat_mul_precision) {
@@ -2042,6 +2078,16 @@ void InitXlaModuleBindings(py::module m) {
     XLATensorPtr xtensor = bridge::GetXlaTensor(input);
     return GetXLAShardingSpec(xtensor);
   });
+  m.def("_get_xla_op_sharding",
+        [](const at::Tensor& input) -> std::optional<xla::OpSharding> {
+          XLATensorPtr xtensor = bridge::GetXlaTensor(input);
+          XLATensor::ShardingSpecPtr sharding_spec =
+              xtensor ? xtensor->sharding_spec() : nullptr;
+          if (sharding_spec != nullptr) {
+            return sharding_spec->sharding;
+          }
+          return std::nullopt;
+        });
   m.def("_get_xla_sharding_specs",
         [](const std::vector<at::Tensor>& tensors) -> std::vector<std::string> {
           tsl::profiler::TraceMe activity("_get_xla_sharding_specs",
@@ -2339,6 +2385,24 @@ void InitXlaModuleBindings(py::module m) {
         [](at::Tensor& self, const at::Tensor& source) -> at::Tensor& {
           return XLANativeFunctions::set_(self, source);
         });
+  m.def("_xla_custom_call",
+        [](const std::vector<at::Tensor>& inputs, const std::string& target,
+           const std::vector<std::vector<int64_t>>& output_shapes,
+           const std::vector<py::object>& output_dtypes, bool has_side_effect,
+           const std::string& backend_config,
+           const int api_version) -> std::vector<at::Tensor> {
+          std::vector<at::ScalarType> dtypes;
+          dtypes.reserve(output_dtypes.size());
+          for (auto& dtype : output_dtypes) {
+            dtypes.push_back(
+                reinterpret_cast<THPDtype*>(dtype.ptr())->scalar_type);
+          }
+
+          auto xtensors = tensor_methods::custom_call(
+              bridge::GetXlaTensors(inputs), target, output_shapes, dtypes,
+              has_side_effect, backend_config, api_version);
+          return bridge::AtenFromXlaTensors(std::move(xtensors));
+        });
   m.def("_xla_tpu_custom_call",
         [](const std::vector<at::Tensor>& inputs, const std::string& payload,
            const std::vector<std::vector<int64_t>>& output_shapes,
@@ -2482,6 +2546,54 @@ void InitXlaModuleBindings(py::module m) {
     return false;
   });
 
+  m.def("_unsafe_buffer_pointer",
+        [](const at::Tensor& input) -> std::uintptr_t {
+          XLATensorPtr xtensor = bridge::GetXlaTensor(input);
+          XLA_CHECK(xtensor) << "The input is not an XLA tensor.";
+          if (xtensor->CurrentDataHandle() != nullptr) {
+            std::shared_ptr<runtime::ComputationClient::Data> data =
+                std::dynamic_pointer_cast<runtime::ComputationClient::Data>(
+                    xtensor->CurrentDataHandle());
+            return runtime::GetComputationClient()->UnsafeBufferPointer(data);
+          } else if (xtensor->CurrentIrValue().node != nullptr) {
+            DeviceData* device_data =
+                DeviceData::Cast(xtensor->CurrentIrValue().node.get());
+            if (device_data != nullptr) {
+              torch::lazy::BackendDataPtr data = device_data->data();
+              return runtime::GetComputationClient()->UnsafeBufferPointer(
+                  UnwrapXlaData(data));
+            } else {
+              XLA_ERROR() << "Could not get the buffer pointer for XLATensor "
+                             "with IR that's not DeviceData";
+            }
+          }
+          XLA_ERROR() << "Could not get the buffer pointer for XLATensor "
+                         "without a data handle or an IR.";
+        });
+
+  // from an XLA tensor to a dlpack tensor.
+  // If ext_data is the result of an CUDA computation, we should synchronize
+  // (waits for all kernels in all streams on a CUDA device to complete) if the
+  // current stream is different from the ext_data's stream. Otherwise, we may
+  // risk of getting incorrect results.
+  m.def("_to_dlpack", [](const at::Tensor& input) -> py::handle {
+    DLManagedTensor* dlMTensor;
+    {
+      NoGilSection nogil;
+      dlMTensor = torch_xla::toDLPack(input);
+    }
+    return PyCapsule_New(dlMTensor, "dltensor", dlPack_Capsule_Destructor);
+  });
+
+  // from a dlpack tensor to an XLA tensor
+  // If ext_data is the result of an CUDA computation, we should synchronize
+  // (waits for all kernels in all streams on a CUDA device to complete) if the
+  // current stream is different from the ext_data's stream. Otherwise, we may
+  // risk of getting incorrect results.
+  m.def("_from_dlpack", [](py::handle ext_data) -> at::Tensor {
+    return tensor_fromDLPack(ext_data.ptr());
+  });
+
   // -------------Dynamo Integration API Start-------------------------
   /*
    * Return tensor ids and at::tensors for all DeviceData nodes that is needed
diff --git a/torch_xla/csrc/ops/custom_call.cpp b/torch_xla/csrc/ops/custom_call.cpp
new file mode 100644
index 00000000000..00347e0c975
--- /dev/null
+++ b/torch_xla/csrc/ops/custom_call.cpp
@@ -0,0 +1,70 @@
+#include "torch_xla/csrc/ops/custom_call.h"
+
+#include <torch/csrc/lazy/core/tensor_util.h>
+
+#include "torch_xla/csrc/lowering_context.h"
+#include "torch_xla/csrc/ops/xla_ops.h"
+#include "torch_xla/csrc/shape_helper.h"
+
+namespace torch_xla {
+
+CustomCall::CustomCall(torch::lazy::OpList inputs,
+                       const std::string& call_target, xla::Shape output_shape,
+                       bool has_side_effect, const std::string& backend_config,
+                       const int api_version)
+    : XlaNode(xla_custom_call, inputs, std::move(output_shape),
+              /*num_outputs=*/output_shape.tuple_shapes_size(),
+              torch::lazy::MHash(call_target)),
+      call_target_(call_target),
+      has_side_effect_(has_side_effect),
+      backend_config_(backend_config),
+      api_version_(api_version) {}
+
+torch::lazy::NodePtr CustomCall::Clone(torch::lazy::OpList operands) const {
+  return torch::lazy::MakeNode<CustomCall>(operands, call_target_,
+                                           this->xla_shape(), has_side_effect_,
+                                           backend_config_, api_version_);
+}
+
+XlaOpVector CustomCall::Lower(LoweringContext* loctx) const {
+  std::vector<xla::XlaOp> inputs;
+  inputs.reserve(this->operands().size());
+  for (auto& operand : operands()) {
+    inputs.push_back(loctx->GetOutputOp(operand));
+  }
+  xla::Shape output_shape = this->xla_shape();
+  const int n_outputs = output_shape.tuple_shapes_size();
+  if (n_outputs == 1) {
+    output_shape = output_shape.tuple_shapes(0);
+  }
+  XLA_CHECK(api_version_ >= 0 && api_version_ < 5);
+  xla::XlaOp output = xla::CustomCall(
+      inputs[0].builder(), call_target_, inputs, output_shape,
+      /*opaque=*/backend_config_,
+      /*has_side_effect=*/has_side_effect_,
+      /*output_operand_aliasing=*/{},
+      /*literal=*/nullptr,
+      /*schedule=*/xla::CustomCallSchedule::SCHEDULE_NONE,
+      /*api_version=*/static_cast<xla::CustomCallApiVersion>(api_version_));
+  std::vector<xla::XlaOp> result;
+  if (n_outputs == 1) {
+    result = {output};
+  } else {
+    result.reserve(n_outputs);
+    for (int i = 0; i < n_outputs; ++i) {
+      result.push_back(xla::GetTupleElement(output, i));
+    }
+  }
+  return ReturnOps(result, loctx);
+}
+
+std::string CustomCall::ToString() const {
+  std::stringstream ss;
+  ss << XlaNode::ToString() << ", call_target=" << call_target_
+     << ", has_side_effect=" << has_side_effect_
+     << ", backend_config=" << backend_config_
+     << ", api_version=" << api_version_;
+  return ss.str();
+}
+
+}  // namespace torch_xla
diff --git a/torch_xla/csrc/ops/custom_call.h b/torch_xla/csrc/ops/custom_call.h
new file mode 100644
index 00000000000..69bb613d4b6
--- /dev/null
+++ b/torch_xla/csrc/ops/custom_call.h
@@ -0,0 +1,29 @@
+#ifndef XLA_TORCH_XLA_CSRC_OPS_CUSTOM_CALL_H_
+#define XLA_TORCH_XLA_CSRC_OPS_CUSTOM_CALL_H_
+
+#include "torch_xla/csrc/ir.h"
+
+namespace torch_xla {
+
+class CustomCall : public XlaNode {
+ public:
+  CustomCall(torch::lazy::OpList inputs, const std::string& call_target,
+             xla::Shape output_shape, bool has_side_effect,
+             const std::string& backend_config, const int api_version);
+
+  std::string ToString() const override;
+
+  torch::lazy::NodePtr Clone(torch::lazy::OpList operands) const override;
+
+  XlaOpVector Lower(LoweringContext* loctx) const override;
+
+ private:
+  std::string call_target_;
+  bool has_side_effect_;
+  std::string backend_config_;
+  int api_version_;
+};
+
+}  // namespace torch_xla
+
+#endif  // XLA_TORCH_XLA_CSRC_OPS_CUSTOM_CALL_H_
diff --git a/torch_xla/csrc/ops/embedding_bag.cpp b/torch_xla/csrc/ops/embedding_bag.cpp
new file mode 100644
index 00000000000..d2bb034a005
--- /dev/null
+++ b/torch_xla/csrc/ops/embedding_bag.cpp
@@ -0,0 +1,192 @@
+#include "torch_xla/csrc/ops/embedding_bag.h"
+
+#include "torch_xla/csrc/helpers.h"
+#include "torch_xla/csrc/lowering_context.h"
+#include "torch_xla/csrc/ops/infer_output_shape.h"
+#include "torch_xla/csrc/ops/xla_ops.h"
+#include "torch_xla/csrc/shape_helper.h"
+#include "torch_xla/csrc/xla_lower_util.h"
+#include "tsl/platform/stacktrace.h"
+#include "xla/client/lib/constants.h"
+#include "xla/client/lib/loops.h"
+#include "xla/client/lib/slicing.h"
+#include "xla/shape_util.h"
+
+namespace torch_xla {
+namespace {
+const int MODE_SUM = 0;
+const int MODE_MEAN = 1;
+const int MODE_MAX = 2;
+std::vector<xla::XlaOp> BuildEmbeddingBag(xla::XlaOp weight, xla::XlaOp indices,
+                                          xla::XlaOp offsets,
+                                          xla::XlaOp per_sample_weights,
+                                          bool include_last_offset, int mode) {
+  xla::Shape offset_shape = ShapeHelper::ShapeOfXlaOp(offsets);
+  int64_t n = offset_shape.dimensions(0);
+  xla::Shape weight_shape = ShapeHelper::ShapeOfXlaOp(weight);
+  int64_t weight_dim = weight_shape.dimensions(1);
+  xla::Shape indices_shape = ShapeHelper::ShapeOfXlaOp(indices);
+  int64_t num_embeddings = indices_shape.dimensions(0);
+  XLA_CHECK(indices_shape.rank() == 1 || indices_shape.rank() == 2)
+      << "input has to be a 1D or 2D Tensor, but got Tensor of dimension "
+      << indices_shape.rank();
+  if (indices_shape.rank() == 1) {
+    XLA_CHECK(offset_shape.rank() == 1)
+        << "offsets has to be a 1D Tensor, but got Tensor of dimension "
+        << offset_shape.rank();
+  }
+  XLA_CHECK(weight_shape.rank() == 2)
+      << "weight has to be a 2D Tensor, but got Tensor of dimension "
+      << weight_shape.rank();
+
+  xla::XlaOp output2 = xla::ZerosLike(indices);
+  xla::XlaOp output3 = xla::ZerosLike(offsets);
+  std::vector<int64_t> sizes = {n, weight_dim};
+  xla::XlaOp output4 =
+      xla::Zeros(offsets.builder(),
+                 xla::ShapeUtil::MakeShape(offset_shape.element_type(), sizes));
+
+  xla::XlaOp embeddings = xla::TorchIndexSelect(weight, indices, 0);
+  xla::XlaOp embeddings_weighted = xla::Mul(
+      embeddings, xla::ConvertElementType(
+                      xla::BroadcastInDim(per_sample_weights,
+                                          {num_embeddings, weight_dim}, {0}),
+                      weight_shape.element_type()));
+
+  std::vector<xla::Shape> shape_elements = {
+      xla::ShapeUtil::MakeShape(offset_shape.element_type(), {}),
+      xla::ShapeUtil::MakeShape(offset_shape.element_type(), {}),
+      xla::ShapeUtil::MakeShape(weight_shape.element_type(),
+                                {num_embeddings, weight_dim}),
+      xla::ShapeUtil::MakeShape(weight_shape.element_type(), {1, weight_dim})};
+  xla::Shape result_shape = xla::ShapeUtil::MakeTupleShape(shape_elements);
+
+  xla::XlaComputation condition;
+  {
+    xla::XlaBuilder builder("condition");
+    auto prev = xla::Parameter(&builder, 0, result_shape, "prev");
+    auto index = xla::GetTupleElement(prev, 0);
+    auto final_value = xla::GetTupleElement(prev, 1);
+    xla::Lt(index, final_value);
+    condition = builder.Build().value();
+  }
+
+  xla::XlaComputation body;
+  {
+    xla::XlaBuilder builder("body");
+    auto prev = xla::Parameter(&builder, 0, result_shape, "prev");
+    auto index = xla::GetTupleElement(prev, 0);
+    auto emb = xla::GetTupleElement(prev, 2);
+    auto w = xla::GetTupleElement(prev, 3);
+
+    xla::XlaOp slice = xla::DynamicSlice(
+        emb,
+        {index, xla::ConvertElementType(xla::ConstantR0<int64_t>(&builder, 0),
+                                        offset_shape.element_type())},
+        {1, weight_dim});
+    xla::XlaOp result =
+        mode == MODE_SUM ? xla::Add(w, slice) : xla::Max(w, slice);
+
+    xla::Tuple(&builder,
+               {
+                   xla::Add(index, xla::ConvertElementType(
+                                       xla::ConstantR0<int64_t>(&builder, 1),
+                                       offset_shape.element_type())),
+                   xla::GetTupleElement(prev, 1),
+                   xla::GetTupleElement(prev, 2),
+                   result,
+               });
+    body = builder.Build().value();
+  }
+
+  xla::Array<float> initial_vector({1, weight_dim}, 0.f);
+  std::vector<xla::XlaOp> results;
+  for (int64_t i = 0; i < n; i++) {
+    xla::XlaOp start = xla::DynamicSlice(
+        offsets, {xla::ConstantR0<int64_t>(offsets.builder(), i)}, {1});
+    if (i == n - 1 && include_last_offset) continue;
+    xla::XlaOp end =
+        i == n - 1 && !include_last_offset
+            ? xla::ConvertElementType(xla::ConstantR1<int64_t>(
+                                          offsets.builder(), 1, num_embeddings),
+                                      offset_shape.element_type())
+            : xla::DynamicSlice(
+                  offsets, {xla::ConstantR0<int64_t>(offsets.builder(), i + 1)},
+                  {1});
+    // Create a While node with computations for the condition and the body.
+    auto init_tuple = xla::Tuple(
+        offsets.builder(),
+        {xla::Reshape(start, {0}, {}), xla::Reshape(end, {0}, {}),
+         embeddings_weighted,
+         xla::ConvertElementType(
+             xla::ConstantFromArray<float>(offsets.builder(), initial_vector),
+             weight_shape.element_type())});
+    auto result = xla::While(condition, body, init_tuple);
+    results.push_back(xla::GetTupleElement(result, 3));
+  };
+  xla::XlaOp output1 = xla::ConcatInDim(offsets.builder(), results, 0);
+  return {output1, output2, output3, output4};
+}
+
+xla::Shape NodeOutputShapes(const torch::lazy::Value& weight,
+                            const torch::lazy::Value& indices,
+                            const torch::lazy::Value& offsets,
+                            const torch::lazy::Value& per_sample_weights,
+                            bool include_last_offset, bool mode) {
+  auto lower_for_shapes_fn =
+      [&](absl::Span<const xla::XlaOp> operands) -> xla::XlaOp {
+    return xla::Tuple(
+        operands[0].builder(),
+        BuildEmbeddingBag(operands[0], operands[1], operands[2], operands[3],
+                          include_last_offset, mode));
+  };
+
+  std::vector<xla::Shape> input_shapes = {
+      GetXlaShape(weight), GetXlaShape(indices), GetXlaShape(offsets),
+      GetXlaShape(per_sample_weights)};
+
+  return InferOutputShape(absl::MakeSpan(input_shapes), lower_for_shapes_fn);
+}
+}  // namespace
+
+std::string EmbeddingBag::ToString() const {
+  std::stringstream ss;
+  ss << XlaNode::ToString();
+  return ss.str();
+}
+
+EmbeddingBag::EmbeddingBag(const torch::lazy::Value& weight,
+                           const torch::lazy::Value& indices,
+                           const torch::lazy::Value& offsets, int64_t mode,
+                           const torch::lazy::Value& per_sample_weights,
+                           bool include_last_offset)
+    : XlaNode(
+          torch::lazy::OpKind(at::aten::embedding_bag),
+          {weight, indices, offsets, per_sample_weights},
+          [&]() {
+            return NodeOutputShapes(weight, indices, offsets,
+                                    per_sample_weights, include_last_offset,
+                                    mode);
+          },
+          /*num_outputs=*/4, torch::lazy::MHash(mode, include_last_offset)),
+      mode_(mode),
+      include_last_offset_(include_last_offset) {}
+
+torch::lazy::NodePtr EmbeddingBag::Clone(torch::lazy::OpList operands) const {
+  return torch::lazy::MakeNode<EmbeddingBag>(operands.at(0), operands.at(1),
+                                             operands.at(2), mode_,
+                                             operands.at(3), false);
+}
+
+XlaOpVector EmbeddingBag::Lower(LoweringContext* loctx) const {
+  xla::XlaOp weight = loctx->GetOutputOp(operand(0));
+  xla::XlaOp indices = loctx->GetOutputOp(operand(1));
+  xla::XlaOp offsets = loctx->GetOutputOp(operand(2));
+  xla::XlaOp per_sample_weights = loctx->GetOutputOp(operand(3));
+  std::vector<xla::XlaOp> ops =
+      BuildEmbeddingBag(weight, indices, offsets, per_sample_weights,
+                        include_last_offset_, mode_);
+  return ReturnOps(absl::MakeSpan(ops), loctx);
+}
+
+}  // namespace torch_xla
\ No newline at end of file
diff --git a/torch_xla/csrc/ops/embedding_bag.h b/torch_xla/csrc/ops/embedding_bag.h
new file mode 100644
index 00000000000..4d9b0a6eecb
--- /dev/null
+++ b/torch_xla/csrc/ops/embedding_bag.h
@@ -0,0 +1,31 @@
+#ifndef XLA_TORCH_XLA_CSRC_OPS_EMBEDDING_BAG_H_
+#define XLA_TORCH_XLA_CSRC_OPS_EMBEDDING_BAG_H_
+
+#include <vector>
+
+#include "torch_xla/csrc/ir.h"
+
+namespace torch_xla {
+
+class EmbeddingBag : public XlaNode {
+ public:
+  EmbeddingBag(const torch::lazy::Value& weight,
+               const torch::lazy::Value& indices,
+               const torch::lazy::Value& offsets, int64_t mode,
+               const torch::lazy::Value& per_sample_weights,
+               bool include_last_offset);
+
+  std::string ToString() const override;
+
+  torch::lazy::NodePtr Clone(torch::lazy::OpList operands) const override;
+
+  XlaOpVector Lower(LoweringContext* loctx) const override;
+
+ private:
+  int64_t mode_;
+  bool include_last_offset_;
+};
+
+}  // namespace torch_xla
+
+#endif  // XLA_TORCH_XLA_CSRC_OPS_EMBEDDING_BAG_H_
\ No newline at end of file
diff --git a/torch_xla/csrc/ops/index_ops.cpp b/torch_xla/csrc/ops/index_ops.cpp
index ddbf0c677aa..ccc3d090a56 100644
--- a/torch_xla/csrc/ops/index_ops.cpp
+++ b/torch_xla/csrc/ops/index_ops.cpp
@@ -277,12 +277,59 @@ torch::lazy::Value EnsureRank1(const torch::lazy::Value& index) {
              : index;
 }
 
+bool HasZeroElementIndex(absl::Span<const XLATensorPtr> indices) {
+  return std::any_of(indices.begin(), indices.end(),
+                     [](const XLATensorPtr& index) {
+                       return xla::ShapeUtil::ElementsIn(*index->shape()) == 0;
+                     });
+}
+
+XLATensorPtr GetZeroElementTensor(const XLATensorPtr& base,
+                                  absl::Span<const XLATensorPtr> indices,
+                                  int64_t start_dim) {
+  // Returns a 0-element tensor described by the indexing.
+  //
+  // At this point, we know that we are indexing 'base' with 0-element
+  // tensors, i.e. one of its dimensions has size 0. Therefore, we
+  // need to return a 0-element tensor of the appropriate size.
+  //
+  // This function computes the output size and calls 'full' to create the
+  // desired 0-element tensor.
+  std::vector<int64_t> dimensions;
+
+  // In the beginning, we add all dimensions that come before the ones that
+  // correspond to the indices.
+  absl::Span<const int64_t> base_dimensions = base->shape().get().dimensions();
+  dimensions.insert(dimensions.end(), base_dimensions.begin(),
+                    base_dimensions.begin() + start_dim);
+
+  // Then, we add the dimensions of the first index. Notice that, at this
+  // point, all indices are already broadcasted, i.e. have the same size.
+  // So, we grab the first one for convenience.
+  for (auto dim : indices.front()->shape().get().dimensions()) {
+    dimensions.push_back(dim);
+  }
+
+  // Finally, add the remaining dimensions that weren't indexed.
+  dimensions.insert(dimensions.end(),
+                    base_dimensions.begin() + start_dim + indices.size(),
+                    base_dimensions.end());
+
+  return tensor_methods::full(dimensions, 0, base->GetDevice(), base->dtype());
+}
+
 XLATensorPtr IndexByTensors(const XLATensorPtr& base,
                             absl::Span<const XLATensorPtr> indices,
                             int64_t start_dim) {
   if (indices.empty()) {
     return base;
   }
+  // Check whether we are trying to index with a 0-element tensor.
+  // If so, there's no need to compute anything. We simply return
+  // a 0-element tensor.
+  if (HasZeroElementIndex(indices)) {
+    return GetZeroElementTensor(base, indices, start_dim);
+  }
   auto canonical_indices = WrapIndicesOnce(base, indices, start_dim);
   int64_t indices_rank = canonical_indices.front()->shape().get().rank();
   // Stack the indices to allow the whole multi-indexing to be dispatched with a
diff --git a/torch_xla/csrc/ops/xla_ops.cpp b/torch_xla/csrc/ops/xla_ops.cpp
index 97ae036b774..e1aa70d56d6 100644
--- a/torch_xla/csrc/ops/xla_ops.cpp
+++ b/torch_xla/csrc/ops/xla_ops.cpp
@@ -9,6 +9,7 @@ const OpKindWrapper xla_as_strided_view_update("xla::as_strided_view_update");
 const OpKindWrapper xla_cast("xla::cast");
 const OpKindWrapper xla_collective_permute("xla::collective_permute");
 const OpKindWrapper xla_cross_replica_sum("xla::cross_replica_sum");
+const OpKindWrapper xla_custom_call("xla::custom_call");
 const OpKindWrapper xla_device_data("xla::device_data");
 const OpKindWrapper xla_dequantize_tensor("xla::dequantize_tensor");
 const OpKindWrapper xla_diagonal_view_update("xla::diagonal_view_update");
diff --git a/torch_xla/csrc/ops/xla_ops.h b/torch_xla/csrc/ops/xla_ops.h
index 7a56d99743b..fff50fe6bc3 100644
--- a/torch_xla/csrc/ops/xla_ops.h
+++ b/torch_xla/csrc/ops/xla_ops.h
@@ -35,6 +35,7 @@ extern const OpKindWrapper xla_as_strided_view_update;
 extern const OpKindWrapper xla_cast;
 extern const OpKindWrapper xla_collective_permute;
 extern const OpKindWrapper xla_cross_replica_sum;
+extern const OpKindWrapper xla_custom_call;
 extern const OpKindWrapper xla_device_data;
 extern const OpKindWrapper xla_dequantize_tensor;
 extern const OpKindWrapper xla_diagonal_view_update;
diff --git a/torch_xla/csrc/reduction.cpp b/torch_xla/csrc/reduction.cpp
index f7b29d1cf3b..56702e79279 100644
--- a/torch_xla/csrc/reduction.cpp
+++ b/torch_xla/csrc/reduction.cpp
@@ -459,9 +459,9 @@ xla::XlaOp BuildArgMin(xla::XlaOp input, int64_t dim, bool keepdim) {
       shape = &ShapeHelper::ShapeOfXlaOp(operand);
     }
   }
-  xla::XlaOp result = xla::ArgMin(
+  xla::XlaOp result = xla::ArgMinMax(
       operand, GetXlaPrimitiveTypeForCurrentDevice(xla::PrimitiveType::S64),
-      dim);
+      dim, /* is_min */ true);
   if (keepdim) {
     auto dimensions = torch::lazy::ToVector<int64_t>(shape->dimensions());
     if (dim_is_none) {
diff --git a/torch_xla/csrc/runtime/BUILD b/torch_xla/csrc/runtime/BUILD
index 6f746972355..582b69d8a50 100644
--- a/torch_xla/csrc/runtime/BUILD
+++ b/torch_xla/csrc/runtime/BUILD
@@ -237,7 +237,7 @@ cc_library(
     deps = [
         ":debug_macros",
         ":sys_util",
-        "@tsl//tsl/distributed_runtime/preemption:preemption_sync_manager",
+        "@xla//xla/tsl/distributed_runtime/preemption:preemption_sync_manager",
         "@xla//xla/pjrt/distributed",
     ],
 )
diff --git a/torch_xla/csrc/runtime/cache.h b/torch_xla/csrc/runtime/cache.h
index bef5b099ec6..9557b2353b7 100644
--- a/torch_xla/csrc/runtime/cache.h
+++ b/torch_xla/csrc/runtime/cache.h
@@ -173,6 +173,7 @@ class PersistentCache : public AbstractCache<K, T, H, E> {
       TORCH_LAZY_COUNTER("PersistentCacheMiss", 1);
       return nullptr;
     }
+    TORCH_LAZY_TIMED("PersistentCacheLoad");
     std::stringstream ss;
     std::ifstream in(path, std::ios::binary);
     ss << in.rdbuf();
diff --git a/torch_xla/csrc/runtime/computation_client.h b/torch_xla/csrc/runtime/computation_client.h
index 33b48255baf..a66ae2a7fa4 100644
--- a/torch_xla/csrc/runtime/computation_client.h
+++ b/torch_xla/csrc/runtime/computation_client.h
@@ -27,6 +27,8 @@
 #include "xla/client/xla_computation.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/literal_util.h"
+#include "xla/pjrt/pjrt_client.h"
+#include "xla/pjrt/pjrt_common.h"
 #include "xla/types.h"
 
 namespace torch_xla {
@@ -191,6 +193,10 @@ class ComputationClient {
       return module->ToString();
     }
 
+    virtual const std::string get_memory_info() const {
+      XLA_ERROR() << "Unimplemented";
+    }
+
    private:
     xla::XlaComputation computation_;
     xla::ProgramShape program_shape_;
@@ -246,8 +252,8 @@ class ComputationClient {
   struct ExecuteReplicatedOptions : public ClientExecuteOptions {};
 
   struct MemoryInfo {
-    int64_t kb_free = 0;
-    int64_t kb_total = 0;
+    int64_t bytes_used = 0;
+    int64_t bytes_limit = 0;
   };
 
   virtual ~ComputationClient() {}
@@ -275,6 +281,9 @@ class ComputationClient {
   // structure will be empty if there is no sharding, like with PjRtData.
   virtual std::optional<xla::OpSharding> GetDataSharding(DataPtr handle) = 0;
 
+  virtual std::string PjRtDeviceToString(
+      xla::PjRtDevice* const device) const = 0;
+
   // Transfers local tensor values to the TPU devices and fetches the handles.
   virtual std::vector<DataPtr> TransferToDevice(
       absl::Span<const std::shared_ptr<const TensorSource>> tensors) = 0;
@@ -302,6 +311,11 @@ class ComputationClient {
   virtual std::vector<xla::Literal> TransferFromDevice(
       absl::Span<const DataPtr> handles) = 0;
 
+  virtual std::uintptr_t UnsafeBufferPointer(const DataPtr handle) = 0;
+
+  virtual std::shared_ptr<xla::PjRtBuffer> GetPjRtBuffer(
+      const DataPtr handle) = 0;
+
   // Compiles a set of computations.
   virtual std::vector<ComputationPtr> Compile(
       std::vector<CompileInstance> instances) = 0;
@@ -342,6 +356,13 @@ class ComputationClient {
 
   virtual torch_xla::DeviceType GetDeviceType() const = 0;
 
+  virtual xla::PjRtPlatformId GetPlatformID() const = 0;
+
+  virtual absl::StatusOr<xla::PjRtDevice*> LookupAddressableDevice(
+      int local_device_id) const = 0;
+
+  virtual std::intptr_t GetCudaStreamForDevice(int local_device_id) const = 0;
+
   virtual size_t GetNumDevices() const = 0;
 
   virtual std::vector<std::string> GetLocalDevices() const = 0;
diff --git a/torch_xla/csrc/runtime/ifrt_computation_client.cc b/torch_xla/csrc/runtime/ifrt_computation_client.cc
index 029f9268342..e15cd238c0d 100644
--- a/torch_xla/csrc/runtime/ifrt_computation_client.cc
+++ b/torch_xla/csrc/runtime/ifrt_computation_client.cc
@@ -58,18 +58,6 @@ torch::lazy::hash_t hash_comp_env(
     xla::ifrt::Client* client,
     std::vector<xla::ifrt::Device*>& ordered_devices) {
   torch::lazy::hash_t hash = hash::HashXlaEnvVars();
-  auto topology_desc = client->GetTopologyForDevices(ordered_devices);
-  if (topology_desc.ok()) {
-    // Some backends support a topology description which provides a better
-    // view of the specific compilation environment.
-    auto serialized = topology_desc.value()->Serialize();
-    if (serialized.ok()) {
-      return torch::lazy::HashCombine(
-          hash,
-          torch::lazy::DataHash(serialized->data(), serialized->length()));
-    }
-    // If serialization fails, fallthrough to the manual approach.
-  }
   std::string platform_name(client->platform_name());
   std::string platform_version(client->platform_version());
   hash = torch::lazy::HashCombine(
@@ -78,10 +66,26 @@ torch::lazy::hash_t hash_comp_env(
   hash = torch::lazy::HashCombine(
       hash, torch::lazy::StringHash(platform_version.c_str()));
   // Include global devices in the hash, ensuring order is consistent.
+  xla::ifrt::DeviceList::Devices ifrt_devices;
   for (auto& device : ordered_devices) {
     std::string device_str(device->ToString());
     hash = torch::lazy::HashCombine(
         hash, torch::lazy::StringHash(device_str.c_str()));
+    ifrt_devices.push_back(device);
+  }
+
+  xla::ifrt::DeviceList device_list(std::move(ifrt_devices));
+  auto topology_desc = client->GetTopologyForDevices(device_list);
+  if (topology_desc.ok()) {
+    // Some backends support a topology description which provides a better
+    // view of the specific compilation environment.
+    auto serialized = topology_desc.value()->Serialize();
+    if (serialized.ok()) {
+      return torch::lazy::HashCombine(
+          hash,
+          torch::lazy::DataHash(serialized->data(), serialized->length()));
+    }
+    // If serialization fails, fallthrough to the manual approach.
   }
   return hash;
 }
@@ -92,7 +96,7 @@ std::string IfrtComputationClient::IfrtDeviceToString(
     xla::ifrt::Device* const device) const {
   std::string platform =
       absl::AsciiStrToUpper(device->client()->platform_name());
-  int ordinal = global_ordinals_.at(device->id());
+  int ordinal = global_ordinals_.at(device->Id().value());
   std::string str = absl::StrFormat("%s:%d", platform, ordinal);
   return str;
 }
@@ -120,11 +124,12 @@ IfrtComputationClient::IfrtComputationClient() {
   // a device's global ordinal separately from its device ID. Order the
   // devices by increasing ID to assign global ordinals.
   std::vector<xla::ifrt::Device*> ordered_devices(client_->device_count());
-  std::partial_sort_copy(client_->devices().begin(), client_->devices().end(),
-                         ordered_devices.begin(), ordered_devices.end(),
-                         [](auto& a, auto& b) { return a->id() < b->id(); });
+  std::partial_sort_copy(
+      client_->devices().begin(), client_->devices().end(),
+      ordered_devices.begin(), ordered_devices.end(),
+      [](auto& a, auto& b) { return a->Id().value() < b->Id().value(); });
   for (auto* device : ordered_devices) {
-    global_ordinals_[device->id()] = global_ordinals_.size();
+    global_ordinals_[device->Id().value()] = global_ordinals_.size();
     std::string device_str = IfrtDeviceToString(device);
     string_to_device_.emplace(device_str, device);
   }
@@ -392,6 +397,16 @@ tsl::RCReference<xla::ifrt::Array> IfrtComputationClient::ReplicateShardedData(
   return *replicated_output;
 }
 
+std::uintptr_t IfrtComputationClient::UnsafeBufferPointer(
+    const DataPtr handle) {
+  XLA_ERROR() << __FUNCTION__ << " not implemented";
+}
+
+std::shared_ptr<xla::PjRtBuffer> IfrtComputationClient::GetPjRtBuffer(
+    const DataPtr handle) {
+  XLA_ERROR() << __FUNCTION__ << " not implemented";
+}
+
 std::vector<xla::Literal> IfrtComputationClient::TransferFromDevice(
     absl::Span<const DataPtr> handles) {
   metrics::TimedSection timed(TransferFromDeviceMetric());
@@ -470,7 +485,7 @@ std::vector<ComputationClient::ComputationPtr> IfrtComputationClient::Compile(
                                      &mlir_module);
     std::unique_ptr<xla::ifrt::LoadedExecutable> executable =
         ConsumeValue(client_->GetDefaultCompiler()->Compile(
-            std::make_unique<xla::ifrt::XlaProgram>(std::move(mlir_module)),
+            std::make_unique<xla::ifrt::HloProgram>(std::move(mlir_module)),
             std::make_unique<xla::ifrt::XlaCompileOptions>(compile_options)));
     StableHloCompileCounter()->AddValue(1);
 
@@ -611,7 +626,7 @@ std::vector<std::string> IfrtComputationClient::GetAllDevices() const {
 int IfrtComputationClient::GetNumProcesses() const {
   int max_process_index = client_->process_index();
   for (auto* device : client_->devices()) {
-    max_process_index = std::max(max_process_index, device->process_index());
+    max_process_index = std::max(max_process_index, device->ProcessIndex());
   }
 
   return max_process_index + 1;
diff --git a/torch_xla/csrc/runtime/ifrt_computation_client.h b/torch_xla/csrc/runtime/ifrt_computation_client.h
index d6d914ad8da..59664d045e8 100644
--- a/torch_xla/csrc/runtime/ifrt_computation_client.h
+++ b/torch_xla/csrc/runtime/ifrt_computation_client.h
@@ -16,6 +16,7 @@
 #include "xla/literal.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_executable.h"
+#include "xla/python/ifrt/hlo/hlo_program.h"
 #include "xla/python/pjrt_ifrt/pjrt_array.h"
 #include "xla/python/pjrt_ifrt/pjrt_client.h"
 #include "xla/python/pjrt_ifrt/xla_compiler.h"
@@ -54,6 +55,10 @@ class IfrtComputationClient : public ComputationClient {
   std::vector<xla::Literal> TransferFromDevice(
       absl::Span<const DataPtr> handles) override;
 
+  std::uintptr_t UnsafeBufferPointer(const DataPtr handle) override;
+
+  std::shared_ptr<xla::PjRtBuffer> GetPjRtBuffer(const DataPtr handle) override;
+
   DataPtr TransferShardsToDevice(
       absl::Span<const std::shared_ptr<const TensorSource>> tensor_shards,
       std::string device, xla::Shape shape, xla::OpSharding sharding) override;
@@ -82,6 +87,19 @@ class IfrtComputationClient : public ComputationClient {
         absl::AsciiStrToUpper(client_->platform_name()));
   };
 
+  xla::PjRtPlatformId GetPlatformID() const override {
+    return client_->platform_id();
+  }
+
+  absl::StatusOr<xla::PjRtDevice*> LookupAddressableDevice(
+      int local_device_id) const override {
+    XLA_ERROR() << __FUNCTION__ << " not implemented";
+  }
+
+  std::intptr_t GetCudaStreamForDevice(int local_device_id) const override {
+    XLA_ERROR() << __FUNCTION__ << " not implemented";
+  }
+
   std::vector<std::string> GetLocalDevices() const override;
 
   std::vector<std::string> GetAllDevices() const override;
@@ -119,6 +137,10 @@ class IfrtComputationClient : public ComputationClient {
     XLA_ERROR() << __FUNCTION__ << " not implemented";
   };
 
+  std::string PjRtDeviceToString(xla::PjRtDevice* const device) const override {
+    XLA_ERROR() << __FUNCTION__ << " not implemented";
+  }
+
   std::string SerializeComputation(const ComputationPtr computation) override {
     XLA_ERROR() << __FUNCTION__ << " not implemented";
   }
@@ -134,7 +156,7 @@ class IfrtComputationClient : public ComputationClient {
   // global_ordinals_ tracks a map from PjRtDeviceId to the device's
   // dense global ordinal.
   std::unordered_map<int, int> global_ordinals_;
-  std::unordered_map<std::string, xla::PjRtDevice* const> string_to_device_;
+  std::unordered_map<std::string, xla::ifrt::Device* const> string_to_device_;
   std::shared_ptr<std::vector<std::string>> replication_devices_;
   OperationManager operation_manager_;
   tsl::thread::ThreadPool pool_ = tsl::thread::ThreadPool(
diff --git a/torch_xla/csrc/runtime/pjrt_computation_client.cc b/torch_xla/csrc/runtime/pjrt_computation_client.cc
index 188e26f8ac2..55089014152 100644
--- a/torch_xla/csrc/runtime/pjrt_computation_client.cc
+++ b/torch_xla/csrc/runtime/pjrt_computation_client.cc
@@ -8,7 +8,6 @@
 #include "absl/strings/ascii.h"
 #include "absl/synchronization/blocking_counter.h"
 #include "absl/types/span.h"
-#include "pjrt_computation_client.h"
 #include "torch_xla/csrc/runtime/computation_client.h"
 #include "torch_xla/csrc/runtime/debug_macros.h"
 #include "torch_xla/csrc/runtime/env_hash.h"
@@ -185,6 +184,13 @@ ComputationClient::DataPtr PjRtComputationClient::CreateDataPlaceholder(
   return std::make_shared<PjRtData>(std::move(device), std::move(shape));
 }
 
+ComputationClient::DataPtr PjRtComputationClient::CreateData(
+    std::string device, xla::Shape shape,
+    std::shared_ptr<xla::PjRtBuffer> pjrt_buffer) {
+  return std::make_shared<PjRtData>(std::move(device), std::move(shape),
+                                    pjrt_buffer);
+}
+
 std::vector<ComputationClient::DataPtr> PjRtComputationClient::GetDataShards(
     ComputationClient::DataPtr data) {
   tsl::profiler::TraceMe activity("PjRtComputationClient::GetDataShards",
@@ -458,12 +464,42 @@ std::vector<ComputationClient::DataPtr> PjRtComputationClient::ReshardData(
   return resharded_results;
 }
 
+std::uintptr_t PjRtComputationClient::UnsafeBufferPointer(
+    const DataPtr handle) {
+  std::shared_ptr<PjRtData> pjrt_data =
+      std::dynamic_pointer_cast<PjRtData>(handle);
+  XLA_CHECK(pjrt_data) << "handle must be PjRtData, got " << handle->ToString();
+  XLA_CHECK(pjrt_data->buffer != nullptr)
+      << "PjRt buffer is null in " << __FUNCTION__;
+  xla::StatusOr<std::uintptr_t> ptr =
+      client_->UnsafeBufferPointer(pjrt_data->buffer.get());
+  XLA_CHECK(ptr.ok());
+  return ptr.value();
+}
+
+std::shared_ptr<xla::PjRtBuffer> PjRtComputationClient::GetPjRtBuffer(
+    const DataPtr handle) {
+  std::shared_ptr<PjRtData> pjrt_data =
+      std::dynamic_pointer_cast<PjRtData>(handle);
+
+  XLA_CHECK(pjrt_data) << "handle must be PjRtData, got " << handle->ToString();
+  std::shared_ptr<xla::PjRtBuffer> pjrt_buffer = pjrt_data->buffer;
+  if (pjrt_buffer != nullptr) {
+    return pjrt_buffer;
+  } else {
+    TF_VLOG(3) << "The pjrt buffer is null so we need to wait for device ops "
+                  "to finish.";
+    WaitDeviceOps({});
+    return std::dynamic_pointer_cast<PjRtData>(handle)->buffer;
+  }
+}
+
 std::vector<xla::Literal> PjRtComputationClient::TransferFromDevice(
     absl::Span<const DataPtr> handles) {
   metrics::TimedSection timed(TransferFromDeviceMetric());
   tsl::profiler::TraceMe activity("PjRtComputationClient::TransferFromDevice",
                                   tsl::profiler::TraceMeLevel::kInfo);
-  std::vector<xla::PjRtFuture<absl::Status>> futures;
+  std::vector<xla::PjRtFuture<>> futures;
   futures.reserve(handles.size());
   std::vector<xla::Literal> literals;
   literals.reserve(handles.size());
@@ -472,7 +508,9 @@ std::vector<xla::Literal> PjRtComputationClient::TransferFromDevice(
     // Use XLA replication to reassemble the sharded data. If input handle
     // is not sharded, then it is a no-op.
     std::shared_ptr<PjRtData> pjrt_data = ReplicateShardedData(handle);
-    XLA_CHECK(pjrt_data);
+    XLA_CHECK(pjrt_data) << "PjRt_data is null in " << __FUNCTION__;
+    XLA_CHECK(pjrt_data->buffer != nullptr)
+        << "PjRt buffer is null in " << __FUNCTION__;
 
     xla::Literal& literal =
         literals.emplace_back(host_output_shape(pjrt_data->buffer.get()));
@@ -482,7 +520,8 @@ std::vector<xla::Literal> PjRtComputationClient::TransferFromDevice(
   }
   for (auto& future : futures) {
     absl::Status status = future.Await();
-    XLA_CHECK_OK(status);
+    XLA_CHECK_OK(status) << "Failed to await future from buffer to literal in"
+                         << __FUNCTION__;
   }
   InboundDataMetric()->AddSample(total_size);
 
@@ -580,6 +619,14 @@ std::vector<ComputationClient::ComputationPtr> PjRtComputationClient::Compile(
           client_->Compile(instance.computation, compile_options).value();
     }
 
+    auto memory_stats_status_or = executable->GetCompiledMemoryStats();
+    if (memory_stats_status_or.ok()) {
+      xla::CompiledMemoryStats memory_stats = memory_stats_status_or.value();
+      TF_VLOG(3) << "memory usage detail = " << memory_stats.DebugString();
+    } else {
+      TF_VLOG(3) << "memory usage is not availiable";
+    }
+
     const auto& hlo_modules = ConsumeValue(executable->GetHloModules());
     xla::HloComputation* hlo_computation = hlo_modules[0]->entry_computation();
     std::shared_ptr<PjRtComputation> pjrt_computation =
@@ -679,7 +726,7 @@ PjRtComputationClient::ExecuteComputation(
   TF_VLOG(5) << "ExecuteComputation acquiring PJRT device lock for " << device
              << " Done";
 
-  std::optional<xla::PjRtFuture<xla::Status>> returned_future;
+  std::optional<xla::PjRtFuture<>> returned_future;
   std::vector<std::unique_ptr<xla::PjRtBuffer>> results =
       pjrt_computation.executable
           ->ExecuteSharded(buffers, pjrt_device, execute_options,
@@ -779,8 +826,8 @@ PjRtComputationClient::ExecuteReplicated(
   TF_VLOG(5) << "ExecuteReplicated acquiring PJRT device lock for "
              << spmd_device_str << " Done";
 
-  std::optional<std::vector<xla::PjRtFuture<xla::Status>>> returned_futures =
-      std::vector<xla::PjRtFuture<xla::Status>>();
+  std::optional<std::vector<xla::PjRtFuture<>>> returned_futures =
+      std::vector<xla::PjRtFuture<>>();
   std::vector<std::vector<std::unique_ptr<xla::PjRtBuffer>>> results;
   {
     tsl::profiler::TraceMe activity(
@@ -915,5 +962,19 @@ std::map<std::string, Metric> PjRtComputationClient::GetMetrics() const {
   return {};
 }
 
+ComputationClient::MemoryInfo PjRtComputationClient::GetMemoryInfo(
+    const std::string& device) {
+  XLA_CHECK_NE(device, spmd_device_str)
+      << "MemoryInfo not supported for SPMD virtual device.";
+  xla::PjRtDevice* pjrt_device =
+      PjRtComputationClient::StringToPjRtDevice(device);
+  tsl::AllocatorStats stats = pjrt_device->GetAllocatorStats().value();
+
+  return {
+      stats.bytes_in_use,
+      *stats.bytes_limit,
+  };
+}
+
 }  // namespace runtime
 }  // namespace torch_xla
diff --git a/torch_xla/csrc/runtime/pjrt_computation_client.h b/torch_xla/csrc/runtime/pjrt_computation_client.h
index 9a911c0139b..1d31107e6b9 100644
--- a/torch_xla/csrc/runtime/pjrt_computation_client.h
+++ b/torch_xla/csrc/runtime/pjrt_computation_client.h
@@ -32,6 +32,9 @@ class PjRtComputationClient : public ComputationClient {
       std::string device, xla::Shape shape,
       std::optional<xla::OpSharding> sharding = std::nullopt) override;
 
+  static DataPtr CreateData(std::string device, xla::Shape shape,
+                            std::shared_ptr<xla::PjRtBuffer> pjrt_buffer);
+
   std::vector<DataPtr> GetDataShards(DataPtr data) override;
 
   DataPtr GetDataShard(DataPtr data, size_t index) override;
@@ -55,6 +58,10 @@ class PjRtComputationClient : public ComputationClient {
   std::vector<xla::Literal> TransferFromDevice(
       absl::Span<const DataPtr> handles) override;
 
+  std::uintptr_t UnsafeBufferPointer(const DataPtr handle) override;
+
+  std::shared_ptr<xla::PjRtBuffer> GetPjRtBuffer(const DataPtr handle) override;
+
   DataPtr TransferShardsToDevice(
       absl::Span<const std::shared_ptr<const TensorSource>> tensor_shards,
       std::string device, xla::Shape shape, xla::OpSharding sharding) override;
@@ -87,6 +94,27 @@ class PjRtComputationClient : public ComputationClient {
         absl::AsciiStrToUpper(client_->platform_name()));
   };
 
+  xla::PjRtPlatformId GetPlatformID() const override {
+    return client_->platform_id();
+  }
+
+  absl::StatusOr<xla::PjRtDevice*> LookupAddressableDevice(
+      int local_device_id) const override {
+    return client_->LookupAddressableDevice(
+        xla::PjRtLocalDeviceId(local_device_id));
+  }
+
+  std::intptr_t GetCudaStreamForDevice(int local_device_id) const override {
+    absl::StatusOr<xla::PjRtDevice*> pjrt_device =
+        client_->LookupAddressableDevice(
+            xla::PjRtLocalDeviceId(local_device_id));
+    XLA_CHECK(pjrt_device.ok()) << "Failed to get a PjRt device.";
+    absl::StatusOr<std::intptr_t> stream =
+        pjrt_device.value()->GetStreamForExternalReadyEvents();
+    XLA_CHECK(stream.ok()) << "Failed to get a stream.";
+    return stream.value();
+  }
+
   std::vector<std::string> GetLocalDevices() const override;
 
   std::vector<std::string> GetAllDevices() const override;
@@ -118,11 +146,11 @@ class PjRtComputationClient : public ComputationClient {
 
   bool CoordinatorInitialized() const override;
 
-  // NOT IMPLEMENTED
+  MemoryInfo GetMemoryInfo(const std::string& device) override;
 
-  MemoryInfo GetMemoryInfo(const std::string& device) override {
-    XLA_ERROR() << __FUNCTION__ << " not implemented";
-  };
+  std::string PjRtDeviceToString(xla::PjRtDevice* const device) const override;
+  std::vector<std::string> PjRtDevicesToString(
+      absl::Span<xla::PjRtDevice* const> devices) const;
 
  private:
   std::unique_ptr<xla::PjRtClient> client_;
@@ -139,10 +167,6 @@ class PjRtComputationClient : public ComputationClient {
 
   xla::PjRtDevice* StringToPjRtDevice(const std::string& device);
 
-  std::string PjRtDeviceToString(xla::PjRtDevice* const device) const;
-  std::vector<std::string> PjRtDevicesToString(
-      absl::Span<xla::PjRtDevice* const> devices) const;
-
   struct PjRtData : public Data {
     PjRtData(std::string device, xla::Shape device_shape)
         : Data(std::move(device), std::move(device_shape)) {}
@@ -261,6 +285,15 @@ class PjRtComputationClient : public ComputationClient {
       output_shardings_ = this->executable->GetOutputShardings();
     }
 
+    const std::string get_memory_info() const override {
+      auto memory_stats_status_or = executable->GetCompiledMemoryStats();
+      if (memory_stats_status_or.ok()) {
+        return memory_stats_status_or.value().DebugString();
+      } else {
+        return "memory usage is not availiable";
+      }
+    }
+
     std::unique_ptr<xla::PjRtLoadedExecutable> executable;
     std::optional<std::vector<xla::OpSharding>> output_shardings_;
   };
diff --git a/torch_xla/csrc/runtime/pjrt_registry.cc b/torch_xla/csrc/runtime/pjrt_registry.cc
index 648076757be..52b06d89cb4 100644
--- a/torch_xla/csrc/runtime/pjrt_registry.cc
+++ b/torch_xla/csrc/runtime/pjrt_registry.cc
@@ -21,8 +21,24 @@ namespace runtime {
 
 namespace {
 
+// Placeholder plugin for testing only. Does not implement multiprocessing or
+// configuration. Very likely will not work from Python code.
+class LibraryPlugin : public PjRtPlugin {
+ public:
+  std::string library_path() const override {
+    return sys_util::GetEnvString("PJRT_LIBRARY_PATH", "");
+  }
+
+  const std::unordered_map<std::string, xla::PjRtValueType>
+  client_create_options() const override {
+    return {};
+  }
+
+  bool requires_xla_coordinator() const override { return false; }
+};
+
 std::unordered_map<std::string, std::shared_ptr<const PjRtPlugin>>
-    pjrt_plugins_;
+    pjrt_plugins_ = {{"LIBRARY", std::make_shared<LibraryPlugin>()}};
 
 xla::GpuAllocatorConfig GetGpuAllocatorConfig() {
   auto allocator_config = xla::GpuAllocatorConfig{};
@@ -60,7 +76,8 @@ InitializePjRt(const std::string& device_type) {
   std::unique_ptr<xla::PjRtClient> client;
   std::unique_ptr<XlaCoordinator> coordinator;
 
-  if (sys_util::GetEnvBool(env::kEnvPjrtDynamicPlugins, false)) {
+  if (sys_util::GetEnvBool(env::kEnvPjrtDynamicPlugins, false) &&
+      device_type != "CPU") {
     std::shared_ptr<const PjRtPlugin> plugin = GetPjRtPlugin(device_type);
     if (plugin) {
       TF_VLOG(1) << "Initializing client for PjRt plugin " << device_type;
diff --git a/torch_xla/csrc/runtime/stablehlo_composite_helper.cc b/torch_xla/csrc/runtime/stablehlo_composite_helper.cc
index 3bc9e9cc309..5035fb221a0 100644
--- a/torch_xla/csrc/runtime/stablehlo_composite_helper.cc
+++ b/torch_xla/csrc/runtime/stablehlo_composite_helper.cc
@@ -7,9 +7,9 @@
 
 #include "absl/log/log.h"
 #include "absl/strings/str_cat.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/Support/LogicalResult.h"
-#include "mlir/Transforms/TopologicalSortUtils.h"
 #include "single_include/nlohmann/json.hpp"
 #include "stablehlo/dialect/StablehloOps.h"
 
diff --git a/torch_xla/csrc/runtime/xla_coordinator.h b/torch_xla/csrc/runtime/xla_coordinator.h
index ae85c79a941..fb2cfaf99f5 100644
--- a/torch_xla/csrc/runtime/xla_coordinator.h
+++ b/torch_xla/csrc/runtime/xla_coordinator.h
@@ -3,8 +3,8 @@
 
 #include <memory>
 
-#include "tsl/distributed_runtime/preemption/preemption_sync_manager.h"
 #include "xla/pjrt/distributed/distributed.h"
+#include "xla/tsl/distributed_runtime/preemption/preemption_sync_manager.h"
 
 namespace torch_xla {
 namespace runtime {
diff --git a/torch_xla/csrc/tensor_methods.cpp b/torch_xla/csrc/tensor_methods.cpp
index f27dc786fb5..7baa951c9a6 100644
--- a/torch_xla/csrc/tensor_methods.cpp
+++ b/torch_xla/csrc/tensor_methods.cpp
@@ -1,5 +1,6 @@
 #include "torch_xla/csrc/tensor_methods.h"
 
+#include <ATen/OpMathType.h>
 #include <ATen/core/Reduction.h>
 #include <torch/csrc/autograd/variable.h>
 #include <torch/csrc/lazy/core/helpers.h>
@@ -40,6 +41,7 @@
 #include "torch_xla/csrc/ops/count_nonzero.h"
 #include "torch_xla/csrc/ops/cumprod.h"
 #include "torch_xla/csrc/ops/cumsum.h"
+#include "torch_xla/csrc/ops/custom_call.h"
 #include "torch_xla/csrc/ops/dequant_tensor.h"
 #include "torch_xla/csrc/ops/device_data.h"
 #include "torch_xla/csrc/ops/diagonal.h"
@@ -48,6 +50,7 @@
 #include "torch_xla/csrc/ops/dynamic_view.h"
 #include "torch_xla/csrc/ops/einsum.h"
 #include "torch_xla/csrc/ops/einsum_backward.h"
+#include "torch_xla/csrc/ops/embedding_bag.h"
 #include "torch_xla/csrc/ops/expand.h"
 #include "torch_xla/csrc/ops/expand_symint.h"
 #include "torch_xla/csrc/ops/exponential.h"
@@ -519,6 +522,41 @@ std::pair<XLATensorPtr, torch::lazy::Value> collective_permute(
           torch::lazy::Value(node, 1)};
 }
 
+std::vector<XLATensorPtr> custom_call(
+    const std::vector<XLATensorPtr>& inputs, const std::string& target,
+    const std::vector<std::vector<int64_t>>& output_shapes,
+    const std::vector<at::ScalarType>& output_dtypes, bool has_side_effect,
+    const std::string& backend_config, const int api_version) {
+  XLA_CHECK(inputs.size() > 0) << "inputs are empty";
+
+  std::vector<torch::lazy::Value> values;
+  values.reserve(inputs.size());
+  for (const auto& input : inputs) {
+    values.push_back(input->GetIrValue());
+  }
+
+  XLA_CHECK_EQ(output_shapes.size(), output_dtypes.size());
+  std::vector<xla::Shape> output_xla_shapes;
+  output_xla_shapes.reserve(output_shapes.size());
+  for (size_t i = 0; i < output_shapes.size(); ++i) {
+    output_xla_shapes.push_back(xla::ShapeUtil::MakeShape(
+        MakeXlaPrimitiveType(output_dtypes[i], &(inputs[0]->GetDevice())),
+        output_shapes[i]));
+  }
+
+  auto node = torch::lazy::MakeNode<CustomCall>(
+      values, target, xla::ShapeUtil::MakeTupleShape(output_xla_shapes),
+      has_side_effect, backend_config, api_version);
+
+  std::vector<XLATensorPtr> outputs;
+  outputs.reserve(output_shapes.size());
+  for (size_t i = 0; i < output_shapes.size(); ++i) {
+    outputs.push_back(
+        inputs[0]->CreateFrom(torch::lazy::Value(node, i), output_dtypes[i]));
+  }
+  return outputs;
+}
+
 void custom_sharding_(
     const XLATensorPtr& input,
     const std::shared_ptr<XLATensor::ShardingSpec>& sharding_spec,
@@ -1223,10 +1261,14 @@ XLATensorPtr div(const XLATensorPtr& input, const at::Scalar& other) {
   if (input_is_float) {
     scalar_type = MaybeUpcastToHostTorchType(input_type);
   }
-  torch::lazy::Value input_value = GetFloatingIrValue(input, scalar_type);
+  at::ScalarType op_math_type = at::toOpMathType(scalar_type);
+  torch::lazy::Value input_value =
+      torch::lazy::MakeNode<Cast>(input->GetIrValue(), op_math_type);
   torch::lazy::Value other_value = XLAGraphExecutor::Get()->GetIrValueForScalar(
-      other, GetXlaShape(input_value).element_type(), input->GetDevice());
-  return input->CreateFrom(Div(input_value, other_value), scalar_type);
+      other, XlaTypeFromTorchType(op_math_type), input->GetDevice());
+  return input->CreateFrom(
+      torch::lazy::MakeNode<Cast>(Div(input_value, other_value), scalar_type),
+      scalar_type);
 }
 
 XLATensorPtr einsum(const std::string& equation,
@@ -1292,6 +1334,20 @@ XLATensorPtr embedding(const XLATensorPtr& weight,
   return tensor_ops::Embedding(weight, indices);
 }
 
+std::tuple<XLATensorPtr, XLATensorPtr, XLATensorPtr, XLATensorPtr>
+embedding_bag(const XLATensorPtr& weight, const XLATensorPtr& indices,
+              const XLATensorPtr& offsets, int64_t mode,
+              const XLATensorPtr& per_sample_weights,
+              bool include_last_offset) {
+  torch::lazy::NodePtr node = torch::lazy::MakeNode<EmbeddingBag>(
+      weight->GetIrValue(), indices->GetIrValue(), offsets->GetIrValue(), mode,
+      per_sample_weights->GetIrValue(), include_last_offset);
+  return std::make_tuple(weight->CreateFrom(torch::lazy::Value(node, 0)),
+                         weight->CreateFrom(torch::lazy::Value(node, 1)),
+                         weight->CreateFrom(torch::lazy::Value(node, 2)),
+                         weight->CreateFrom(torch::lazy::Value(node, 3)));
+}
+
 XLATensorPtr exp(const XLATensorPtr& input) {
   return input->CreateFrom(Exp(input->GetIrValue()));
 }
diff --git a/torch_xla/csrc/tensor_methods.h b/torch_xla/csrc/tensor_methods.h
index f27465fd67d..1d565dd351a 100644
--- a/torch_xla/csrc/tensor_methods.h
+++ b/torch_xla/csrc/tensor_methods.h
@@ -80,6 +80,12 @@ std::pair<XLATensorPtr, torch::lazy::Value> collective_permute(
     const XLATensorPtr& input, const torch::lazy::Value& token,
     std::vector<std::pair<int64_t, int64_t>> source_target_pairs);
 
+std::vector<XLATensorPtr> custom_call(
+    const std::vector<XLATensorPtr>& inputs, const std::string& target,
+    const std::vector<std::vector<int64_t>>& output_shapes,
+    const std::vector<at::ScalarType>& output_dtypes, bool has_side_effect,
+    const std::string& backend_config, const int api_version);
+
 void custom_sharding_(
     const XLATensorPtr& input,
     const std::shared_ptr<XLATensor::ShardingSpec>& spec,
@@ -381,6 +387,11 @@ XLATensorPtr embedding_dense_backward(const XLATensorPtr& grad_output,
                                       int64_t num_weights, int64_t padding_idx,
                                       bool scale_grad_by_freq);
 
+std::tuple<XLATensorPtr, XLATensorPtr, XLATensorPtr, XLATensorPtr>
+embedding_bag(const XLATensorPtr& weight, const XLATensorPtr& indices,
+              const XLATensorPtr& offsets, int64_t mode,
+              const XLATensorPtr& per_sample_weights, bool include_last_offset);
+
 XLATensorPtr embedding(const XLATensorPtr& weight, const XLATensorPtr& indices);
 
 XLATensorPtr eq(const XLATensorPtr& input, const at::Scalar& other);
diff --git a/torch_xla/csrc/tensor_util.cpp b/torch_xla/csrc/tensor_util.cpp
index dd13bd63d1b..8822b6de7c4 100644
--- a/torch_xla/csrc/tensor_util.cpp
+++ b/torch_xla/csrc/tensor_util.cpp
@@ -17,6 +17,7 @@
 #include "torch_xla/csrc/dtype.h"
 #include "torch_xla/csrc/helpers.h"
 #include "torch_xla/csrc/layout_manager.h"
+#include "torch_xla/csrc/ops/device_data.h"
 #include "torch_xla/csrc/runtime/computation_client.h"
 #include "torch_xla/csrc/runtime/debug_macros.h"
 #include "torch_xla/csrc/runtime/runtime.h"
@@ -931,4 +932,24 @@ xla::PrimitiveType GetShapeDimensionType(
   return xla::PrimitiveType::S32;
 }
 
+std::shared_ptr<runtime::ComputationClient::Data> get_data_handle(
+    const at::Tensor& input) {
+  XLATensorPtr xtensor = bridge::GetXlaTensor(input);
+  if (xtensor->CurrentDataHandle() != nullptr) {
+    TF_VLOG(4) << "The xla tensor has a current data handle.";
+    return std::dynamic_pointer_cast<runtime::ComputationClient::Data>(
+        xtensor->CurrentDataHandle());
+  } else if (xtensor->CurrentIrValue().node != nullptr) {
+    DeviceData* device_data =
+        DeviceData::Cast(xtensor->CurrentIrValue().node.get());
+    if (device_data != nullptr) {
+      return UnwrapXlaData(device_data->data());
+    }
+    TF_VLOG(4) << "The xla tensor has IR value but does not have device data.";
+  }
+  TF_VLOG(4)
+      << "The xla tensor either has no current data handle or has no IR value.";
+  return nullptr;
+}
+
 }  // namespace torch_xla
diff --git a/torch_xla/csrc/tensor_util.h b/torch_xla/csrc/tensor_util.h
index 7d726c00b50..0804d3e9f78 100644
--- a/torch_xla/csrc/tensor_util.h
+++ b/torch_xla/csrc/tensor_util.h
@@ -212,6 +212,9 @@ inline std::vector<at::Tensor> xla_expand_outplace(at::TensorList to_expand) {
   }
 }
 
+std::shared_ptr<runtime::ComputationClient::Data> get_data_handle(
+    const at::Tensor& input);
+
 }  // namespace torch_xla
 
 #endif  // XLA_TORCH_XLA_CSRC_TENSOR_UTIL_H_
diff --git a/torch_xla/csrc/unwrap_data.h b/torch_xla/csrc/unwrap_data.h
index 7d5080e84bf..6bf1cc60e0a 100644
--- a/torch_xla/csrc/unwrap_data.h
+++ b/torch_xla/csrc/unwrap_data.h
@@ -11,6 +11,9 @@
 
 namespace torch_xla {
 
+runtime::ComputationClient::DataPtr UnwrapXlaData(
+    const torch::lazy::BackendDataPtr& data);
+
 std::vector<runtime::ComputationClient::DataPtr> UnwrapXlaData(
     absl::Span<const torch::lazy::BackendDataPtr> datas);
 
diff --git a/torch_xla/csrc/xla_graph_executor.cpp b/torch_xla/csrc/xla_graph_executor.cpp
index fe12e392ea4..660a4a0fb18 100644
--- a/torch_xla/csrc/xla_graph_executor.cpp
+++ b/torch_xla/csrc/xla_graph_executor.cpp
@@ -45,6 +45,7 @@
 #include "torch_xla/csrc/runtime/computation_client.h"
 #include "torch_xla/csrc/runtime/debug_macros.h"
 #include "torch_xla/csrc/runtime/env_vars.h"
+#include "torch_xla/csrc/runtime/pjrt_computation_client.h"
 #include "torch_xla/csrc/runtime/runtime.h"
 #include "torch_xla/csrc/runtime/stablehlo_helper.h"
 #include "torch_xla/csrc/runtime/sys_util.h"
@@ -1396,6 +1397,7 @@ XLAGraphExecutor::CompilationResult XLAGraphExecutor::Compile(
   std::vector<std::shared_ptr<runtime::ComputationClient::Computation>>
       computations =
           runtime::GetComputationClient()->Compile(std::move(instances));
+  DebugUtil::post_compilation_analysis(computations[0]);
   TF_VLOG(3) << "Compiling IR graph hash "
              << torch::lazy::HashToString(coll.hash) << " on device "
              << coll.device << " done!";
diff --git a/torch_xla/csrc/xla_manual_registration.cpp b/torch_xla/csrc/xla_manual_registration.cpp
index dc7df436ec7..6020ef6bc04 100644
--- a/torch_xla/csrc/xla_manual_registration.cpp
+++ b/torch_xla/csrc/xla_manual_registration.cpp
@@ -1,7 +1,9 @@
 #include <ATen/ATen.h>
 #include <torch/library.h>
 
+#include "torch_xla/csrc/aten_cpu_fallback.h"
 #include "torch_xla/csrc/aten_xla_bridge.h"
+#include "torch_xla/csrc/debug_util.h"
 #include "torch_xla/csrc/ops/nms.h"
 #include "torch_xla/csrc/ops/ops.h"
 #include "torch_xla/csrc/tensor_methods.h"
@@ -11,10 +13,22 @@ namespace torch_xla {
 namespace manual {
 namespace {
 
+struct NmsOp {
+  using schema = at::Tensor(const at::Tensor&, const at::Tensor&, double);
+  using ptr_schema = schema*;
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "torchvision::nms")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+};
+
 at::Tensor nms_kernel(const at::Tensor& boxes, const at::Tensor& scores,
                       double iou_threshold) {
   TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::");
 
+  if (!DebugUtil::ExperimentEnabled("nms")) {
+    return at::native::call_fallback_fn<&xla_cpu_fallback, NmsOp>::call(
+        boxes, scores, iou_threshold);
+  }
+
   XLA_CHECK_EQ(boxes.dim(), 2) << "nms(): boxes should be a 2D tensor.";
   XLA_CHECK_EQ(boxes.size(1), 4)
       << "nms(): boxes should be a 2D tensor of shape [N, 4].";
diff --git a/torch_xla/debug/metrics.py b/torch_xla/debug/metrics.py
index 363c52a80da..11718e8376b 100644
--- a/torch_xla/debug/metrics.py
+++ b/torch_xla/debug/metrics.py
@@ -79,3 +79,8 @@ def short_metrics_report(counter_names: list = None, metric_names: list = None):
         'TransferToDeviceTime', 'TransferFromDeviceTime'
     ]
   return torch_xla._XLAC._short_xla_metrics_report(counter_names, metric_names)
+
+
+def executed_fallback_ops():
+  """Retrieves a list of operations that were run in fallback mode."""
+  return torch_xla._XLAC._get_executed_fallback_ops()
diff --git a/torch_xla/distributed/spmd/__init__.py b/torch_xla/distributed/spmd/__init__.py
index abfe1c62ba0..099f25e9fb5 100644
--- a/torch_xla/distributed/spmd/__init__.py
+++ b/torch_xla/distributed/spmd/__init__.py
@@ -27,4 +27,6 @@
     "_mark_manual_sharding",
     "enable_manual_sharding",
     "disable_manual_sharding",
+    "enable_manual_sharding",
+    "disable_manual_sharding",
 ]
diff --git a/torch_xla/experimental/custom_kernel.py b/torch_xla/experimental/custom_kernel.py
index ff4b335058b..1a8a8cd3852 100644
--- a/torch_xla/experimental/custom_kernel.py
+++ b/torch_xla/experimental/custom_kernel.py
@@ -2,11 +2,13 @@
 import os
 import warnings
 
+import numpy as np
 import torch
 import torch_xla
 import torch_xla.core.xla_model as xm
+import torch_xla.distributed.spmd as xs
 
-from typing import List, Callable
+from typing import Any, List, Callable
 from torch.library import impl
 from torch_xla.core.xla_model import XLA_LIB
 
@@ -17,7 +19,7 @@ def _extract_backend_config(
     module: "jaxlib.mlir._mlir_libs._mlir.ir.Module") -> str | None:
   """
   This algorithm intends to extract the backend config from the compiler IR like the following,
-  and it is designed to traverse any generic MLIR module.
+  and it is not designed to traverse any generic MLIR module.
 
   module @jit_add_vectors attributes {mhlo.num_partitions = 1 : i32, mhlo.num_replicas = 1 : i32} {
     func.func public @main(%arg0: tensor<8xi32> {mhlo.layout_mode = "default", mhlo.sharding = "{replicated}"}, %arg1: tensor<8xi32> {mhlo.layout_mode = "default", mhlo.sharding = "{replicated}"}) -> (tensor<8xi32> {jax.result_info = "", mhlo.layout_mode = "default"}) {
@@ -54,6 +56,48 @@ def jax_import_guard():
   torch_xla._XLAC._init_computation_client()
 
 
+def convert_torch_dtype_to_jax(dtype: torch.dtype) -> "jnp.dtype":
+  # Import JAX within the function such that we don't need to call the jax_import_guard()
+  # in the global scope which could cause problems for xmp.spawn.
+  jax_import_guard()
+  import jax.numpy as jnp
+
+  if dtype == torch.float32:
+    if _XLA_USE_BF16:
+      return jnp.bfloat16
+    return jnp.float32
+  elif dtype == torch.float64:
+    if _XLA_USE_BF16:
+      return jnp.bfloat16
+    return jnp.float64
+  elif dtype == torch.float16:
+    return jnp.float16
+  elif dtype == torch.bfloat16:
+    return jnp.bfloat16
+  elif dtype == torch.int32:
+    return jnp.int32
+  elif dtype == torch.int64:
+    return jnp.int64
+  elif dtype == torch.int16:
+    return jnp.int16
+  elif dtype == torch.int8:
+    return jnp.int8
+  elif dtype == torch.uint8:
+    return jnp.uint8
+  else:
+    raise ValueError(f"Unsupported dtype: {dtype}")
+
+
+def to_jax_shape_dtype_struct(tensor: torch.Tensor) -> "jax.ShapeDtypeStruct":
+  # Import JAX within the function such that we don't need to call the jax_import_guard()
+  # in the global scope which could cause problems for xmp.spawn.
+  jax_import_guard()
+  import jax
+
+  return jax.ShapeDtypeStruct(tensor.shape,
+                              convert_torch_dtype_to_jax(tensor.dtype))
+
+
 def trace_pallas(kernel: Callable,
                  *args,
                  static_argnums=None,
@@ -63,43 +107,15 @@ def trace_pallas(kernel: Callable,
   # in the global scope which could cause problems for xmp.spawn.
   jax_import_guard()
   import jax
-  import jax.numpy as jnp
   import jax._src.pallas.mosaic.pallas_call_registration
 
-  def convert_torch_dtype_to_jax(dtype: torch.dtype) -> jnp.dtype:
-    if dtype == torch.float32:
-      if _XLA_USE_BF16:
-        return jnp.bfloat16
-      return jnp.float32
-    elif dtype == torch.float64:
-      if _XLA_USE_BF16:
-        return jnp.bfloat16
-      return jnp.float64
-    elif dtype == torch.float16:
-      return jnp.float16
-    elif dtype == torch.bfloat16:
-      return jnp.bfloat16
-    elif dtype == torch.int32:
-      return jnp.int32
-    elif dtype == torch.int64:
-      return jnp.int64
-    elif dtype == torch.int16:
-      return jnp.int16
-    elif dtype == torch.int8:
-      return jnp.int8
-    elif dtype == torch.uint8:
-      return jnp.uint8
-    else:
-      raise ValueError(f"Unsupported dtype: {dtype}")
-
   jax_args = []  # for tracing
   tensor_args = []  # for execution
   for i, arg in enumerate(args):
     # TODO: Could the args be a tuple of tensors or a list of tensors? Flattern them?
     if torch.is_tensor(arg):
       # ShapeDtypeStruct doesn't have any storage and thus is very suitable for generating the payload.
-      jax_meta_tensor = jax.ShapeDtypeStruct(
-          arg.shape, convert_torch_dtype_to_jax(arg.dtype))
+      jax_meta_tensor = to_jax_shape_dtype_struct(arg)
       jax_args.append(jax_meta_tensor)
       tensor_args.append(arg)
     else:
@@ -166,61 +182,129 @@ class FlashAttention(torch.autograd.Function):
       "block_k_dq": 256,
       "block_k_major_dq": 512,
   }
+  NUM_LANES = 128
+  NUM_SUBLANES = 8
+
+  @staticmethod
+  def prepare_segment_ids(q_segment_ids, kv_segment_ids):
+    from jax.experimental.pallas.ops.tpu.flash_attention import SegmentIds
+    if q_segment_ids is None or kv_segment_ids is None:
+      return None, None, None
+
+    assert q_segment_ids is not None and kv_segment_ids is not None, "Both q_segment_ids and kv_segment_ids should be provided."
+    segment_ids = SegmentIds(
+        to_jax_shape_dtype_struct(q_segment_ids),
+        to_jax_shape_dtype_struct(kv_segment_ids))
+    q_segment_ids = q_segment_ids.unsqueeze(-1).expand(
+        [-1 for _ in q_segment_ids.shape] + [FlashAttention.NUM_LANES])
+    kv_segment_ids = kv_segment_ids.unsqueeze(1).expand([
+        kv_segment_ids.shape[0], FlashAttention.NUM_SUBLANES,
+        kv_segment_ids.shape[1]
+    ])
+    return segment_ids, q_segment_ids, kv_segment_ids
 
   @staticmethod
-  def forward(ctx, q, k, v, causal=False):
+  def forward(ctx, q, k, v, causal, q_segment_ids, kv_segment_ids, sm_scale,
+              partition_spec, mesh):
     # Import JAX within the function such that we don't need to call the jax_import_guard()
     # in the global scope which could cause problems for xmp.spawn.
     jax_import_guard()
+    import jax
     from jax.experimental.pallas.ops.tpu.flash_attention import _flash_attention_impl
 
     ctx.causal = causal
+    ctx.sm_scale = sm_scale
+    ctx.partition_spec = partition_spec
+    ctx.mesh = mesh
+    ctx.full_shape = None
     save_residuals = q.requires_grad or k.requires_grad or v.requires_grad
 
-    # It returns the shape and type of o, l, m.
-    def shape_dtype(q, *arg):
-      if not save_residuals:
-        return [(q.shape, q.dtype)]
+    # SPMD integration.
+    # mark_sharding is in-placed, and therefore save the full q, k, v for the backward.
+    full_q = q
+    full_k = k
+    full_v = v
+    if partition_spec is not None:
+      ctx.full_shape = q.shape
+      q = xs.enable_manual_sharding(q, partition_spec, mesh=mesh).global_tensor
+      k = xs.enable_manual_sharding(k, partition_spec, mesh=mesh).global_tensor
+      v = xs.enable_manual_sharding(v, partition_spec, mesh=mesh).global_tensor
+
+    # It computes the shape and type of o, l, m.
+    shapes = [q.shape]
+    dtypes = [q.dtype]
+    if save_residuals:
       res_shape = list(q.shape)
       res_shape[-1] = FlashAttention.MIN_BLOCK_SIZE
-      return [(q.shape, q.dtype), (res_shape, torch.float32),
-              (res_shape, torch.float32)]
-
-    # We can't directly use flash_attention as we need to override the save_residuals flag which returns
-    # l and m that is needed for the backward. Then we lose all the shape checks.
-    # TODO: replicate the shape checks on flash_attention.
-    _flash_attention_impl = make_kernel_from_pallas(_flash_attention_impl,
-                                                    shape_dtype)
+      for _ in range(2):
+        shapes.append(res_shape)
+        dtypes.append(torch.float32)
+
     with torch.no_grad():
-      o = _flash_attention_impl(
+      segment_ids, q_segment_ids, kv_segment_ids = FlashAttention.prepare_segment_ids(
+          q_segment_ids, kv_segment_ids)
+      ctx.segment_ids = segment_ids
+
+      # We can't directly use flash_attention as we need to override the save_residuals flag which returns
+      # l and m that is needed for the backward. Then we lose all the shape checks.
+      # TODO: replicate the shape checks on flash_attention.
+      # Here we seperate the tracing and execution part just to support SegmentIds.
+      payload, _ = trace_pallas(
+          _flash_attention_impl,
           q,
           k,
           v,
           None,
-          None,
+          segment_ids,
           save_residuals,
           causal,
-          1.0,
+          sm_scale,
           min(FlashAttention.DEFAULT_BLOCK_SIZES["block_b"], q.shape[0]),
           min(FlashAttention.DEFAULT_BLOCK_SIZES["block_q"], q.shape[2]),
           min(FlashAttention.DEFAULT_BLOCK_SIZES["block_k_major"], k.shape[2]),
           min(FlashAttention.DEFAULT_BLOCK_SIZES["block_k"], k.shape[2]),
           False,
           static_argnums=range(5, 13))
+
+      args = [q, k, v]
+      if segment_ids is not None:
+        args += [q_segment_ids, kv_segment_ids]
+      o = torch_xla._XLAC._xla_tpu_custom_call(args, payload, shapes, dtypes)
+
       if not save_residuals:
+        o = o[0]
+        # SPMD integration
+        if partition_spec is not None:
+          o = xs.disable_manual_sharding(
+              o, partition_spec, ctx.full_shape, mesh=mesh).global_tensor
         return o
       o, *aux = o
       l, m = (v[..., 0] for v in aux[-2:])
 
-    ctx.save_for_backward(q, k, v, o, l, m)
+    # SPMD integration
+    if partition_spec is not None:
+      o = xs.disable_manual_sharding(
+          o, partition_spec, ctx.full_shape, mesh=mesh).global_tensor
+      l = xs.disable_manual_sharding(
+          l, partition_spec[0:3], ctx.full_shape[0:3], mesh=mesh).global_tensor
+      m = xs.disable_manual_sharding(
+          m, partition_spec[0:3], ctx.full_shape[0:3], mesh=mesh).global_tensor
+
+    ctx.save_for_backward(full_q, full_k, full_v, o, l, m, q_segment_ids,
+                          kv_segment_ids)
     return o
 
   @staticmethod
   def backward(ctx, grad_output):
     from jax.experimental.pallas.ops.tpu.flash_attention import _flash_attention_bwd_dq, _flash_attention_bwd_dkv
 
-    q, k, v, o, l, m = ctx.saved_tensors
+    q, k, v, o, l, m, q_segment_ids, kv_segment_ids = ctx.saved_tensors
     causal = ctx.causal
+    sm_scale = ctx.sm_scale
+    partition_spec = ctx.partition_spec
+    mesh = ctx.mesh
+    full_shape = ctx.full_shape
+    segment_ids = ctx.segment_ids
     grad_q = grad_k = grad_v = None
 
     grad_i = torch.sum(
@@ -234,6 +318,20 @@ def backward(ctx, grad_output):
     expanded_grad_i = grad_i.unsqueeze(-1).expand(
         [-1 for _ in grad_i.shape] + [FlashAttention.MIN_BLOCK_SIZE])
 
+    # SPMD integration
+    if partition_spec is not None:
+      q = xs.enable_manual_sharding(q, partition_spec, mesh=mesh).global_tensor
+      k = xs.enable_manual_sharding(k, partition_spec, mesh=mesh).global_tensor
+      v = xs.enable_manual_sharding(v, partition_spec, mesh=mesh).global_tensor
+      expanded_l = xs.enable_manual_sharding(
+          expanded_l, partition_spec, mesh=mesh).global_tensor
+      expanded_m = xs.enable_manual_sharding(
+          expanded_m, partition_spec, mesh=mesh).global_tensor
+      grad_output = xs.enable_manual_sharding(
+          grad_output, partition_spec, mesh=mesh).global_tensor
+      expanded_grad_i = xs.enable_manual_sharding(
+          expanded_grad_i, partition_spec, mesh=mesh).global_tensor
+
     if ctx.needs_input_grad[0]:
       payload, _ = trace_pallas(
           _flash_attention_bwd_dq,
@@ -241,7 +339,7 @@ def backward(ctx, grad_output):
           k,
           v,
           None,
-          None,
+          segment_ids,
           l,
           m,
           grad_output,
@@ -253,7 +351,7 @@ def backward(ctx, grad_output):
               k.shape[2]),
           block_k=min(FlashAttention.DEFAULT_BLOCK_SIZES["block_k_dq"],
                       k.shape[2]),
-          sm_scale=1.0,
+          sm_scale=sm_scale,
           causal=causal,
           mask_value=FlashAttention.DEFAULT_MASK_VALUE,
           debug=False,
@@ -261,9 +359,13 @@ def backward(ctx, grad_output):
               "block_q_major", "block_k_major", "block_k", "sm_scale", "causal",
               "mask_value", "debug"
           ])
-      grad_q = torch_xla._XLAC._xla_tpu_custom_call(
-          [q, k, v, expanded_l, expanded_m, grad_output, expanded_grad_i],
-          payload, [q.shape], [q.dtype])[0]
+
+      args = [q, k, v]
+      if segment_ids is not None:
+        args += [q_segment_ids, kv_segment_ids]
+      args += [expanded_l, expanded_m, grad_output, expanded_grad_i]
+      grad_q = torch_xla._XLAC._xla_tpu_custom_call(args, payload, [q.shape],
+                                                    [q.dtype])[0]
 
     if ctx.needs_input_grad[1] or ctx.needs_input_grad[2]:
       payload, _ = trace_pallas(
@@ -272,7 +374,7 @@ def backward(ctx, grad_output):
           k,
           v,
           None,
-          None,
+          segment_ids,
           l,
           m,
           grad_output,
@@ -287,7 +389,7 @@ def backward(ctx, grad_output):
                       k.shape[2]),
           block_q=min(FlashAttention.DEFAULT_BLOCK_SIZES["block_q_dkv"],
                       q.shape[2]),
-          sm_scale=1.0,
+          sm_scale=sm_scale,
           causal=causal,
           mask_value=FlashAttention.DEFAULT_MASK_VALUE,
           debug=False,
@@ -295,15 +397,29 @@ def backward(ctx, grad_output):
               "block_q_major", "block_k_major", "block_k", "block_q",
               "sm_scale", "causal", "mask_value", "debug"
           ])
-      grads = torch_xla._XLAC._xla_tpu_custom_call(
-          [q, k, v, expanded_l, expanded_m, grad_output, expanded_grad_i],
-          payload, [k.shape, v.shape], [k.dtype, v.dtype])
+
+      args = [q, k, v]
+      if segment_ids is not None:
+        args += [q_segment_ids, kv_segment_ids]
+      args += [expanded_l, expanded_m, grad_output, expanded_grad_i]
+      grads = torch_xla._XLAC._xla_tpu_custom_call(args, payload,
+                                                   [k.shape, v.shape],
+                                                   [k.dtype, v.dtype])
     if ctx.needs_input_grad[1]:
       grad_k = grads[0]
     if ctx.needs_input_grad[2]:
       grad_v = grads[1]
 
-    return grad_q, grad_k, grad_v, None
+    # SPMD integration
+    if partition_spec is not None:
+      grad_q = xs.disable_manual_sharding(
+          grad_q, partition_spec, full_shape, mesh=mesh).global_tensor
+      grad_k = xs.disable_manual_sharding(
+          grad_k, partition_spec, full_shape, mesh=mesh).global_tensor
+      grad_v = xs.disable_manual_sharding(
+          grad_v, partition_spec, full_shape, mesh=mesh).global_tensor
+
+    return grad_q, grad_k, grad_v, None, None, None, None, None, None
 
 
 def flash_attention(
@@ -311,8 +427,430 @@ def flash_attention(
     k,  # [batch_size, num_heads, kv_seq_len, d_model]
     v,  # [batch_size, num_heads, kv_seq_len, d_model]
     causal=False,
-):
-  return FlashAttention.apply(q, k, v, causal)
+    q_segment_ids=None,  # [batch_size, q_seq_len]
+    kv_segment_ids=None,  # [batch_size, kv_seq_len]
+    sm_scale=1.0,
+    *,
+    partition_spec=None,
+    mesh=None):
+  # TODO: support SPMD and Dynamo with segment_ids.
+  return FlashAttention.apply(q, k, v, causal, q_segment_ids, kv_segment_ids,
+                              sm_scale, partition_spec, mesh)
+
+
+def paged_attention(q,
+                    k_pages,
+                    v_pages,
+                    lengths,
+                    page_indices,
+                    pages_per_compute_block,
+                    megacore_mode: str = None):
+  # Import JAX within the function such that we don't need to call the jax_import_guard()
+  # in the global scope which could cause problems for xmp.spawn.
+  jax_import_guard()
+  from jax.experimental.pallas.ops.tpu.paged_attention.paged_attention_kernel import paged_attention
+
+  assert megacore_mode in [
+      "kv_head", "batch", None
+  ], "megacore_mode must be one of ['kv_head', 'batch', None]."
+
+  payload, tensor_args = trace_pallas(
+      paged_attention,
+      q,
+      k_pages,
+      v_pages,
+      lengths,
+      page_indices,
+      pages_per_compute_block=pages_per_compute_block,
+      megacore_mode=megacore_mode,
+      static_argnames=["pages_per_compute_block", "megacore_mode"],
+  )
+
+  batch_size, num_heads, head_dim = q.shape
+  num_kv_heads, _, page_size, head_dim_k = k_pages.shape
+  batch_size_paged_indices, pages_per_sequence = page_indices.shape
+  q_dtype_for_kernel_launch = q.dtype
+  if (num_heads // num_kv_heads) % 8 != 0:
+    q = q.reshape(batch_size, num_heads, 1, head_dim)
+    q_dtype_for_kernel_launch = torch.float32
+
+  page_indices_reshaped = page_indices.reshape(-1)
+  buffer_index = torch.zeros((1,), dtype=torch.int32).to("xla")
+  step = torch.zeros((1,), dtype=torch.int32).to("xla")
+  output_shape = torch.Size(list(q.shape[:-1]) + [1])
+
+  output, _, _ = torch_xla._XLAC._xla_tpu_custom_call(
+      [
+          lengths,
+          page_indices_reshaped,
+          buffer_index,
+          step,
+          q.to(q_dtype_for_kernel_launch),
+          k_pages,
+          v_pages,
+      ], payload, [q.shape, output_shape, output_shape],
+      [q_dtype_for_kernel_launch, torch.float32, torch.float32])
+
+  return output.reshape(batch_size, num_heads, head_dim).to(q.dtype)
+
+
+def _calculate_num_tiles(x: int, tx: int) -> int:
+  tiles, rem = divmod(x, tx)
+  if rem:
+    raise ValueError(f"{x} must be divisible by x-dimension tile size ({tx}).")
+  return tiles
+
+
+def _histogram(input: torch.Tensor, min: int, max: int) -> torch.Tensor:
+  """
+  Compute the histogram of a int32 tensor. The bin edges are defined by the min and max values, with step = 1.
+  """
+  assert input.dtype == torch.int32, "input must be of torch.int32 dtype."
+  assert min <= max, "min must be less than or equal to max."
+
+  def searchsorted(sorted_sequence: torch.Tensor,
+                   values_to_search: torch.Tensor) -> torch.Tensor:
+    return (sorted_sequence.unsqueeze(1) == values_to_search).sum(dim=1)
+
+  bin_edges = torch.linspace(
+      min, max, max - min + 1, dtype=input.dtype).to(input.device)
+  return searchsorted(bin_edges, input)
+
+
+# Refence: https://github.com/google/jax/blob/main/jax/experimental/pallas/ops/tpu/megablox/gmm.py#L78
+def _make_group_metadata(
+    *,
+    group_sizes: torch.Tensor,
+    m: int,
+    tm: int,
+    visit_empty_groups: bool,
+) -> Any:
+  """Create the metadata needed for grouped matmul computation.
+
+  Args:
+    group_sizes: A 1d, torch.Tensor with shape [num_groups] and torch.int32 dtype.
+    m: The number of rows in lhs.
+    tm: The m-dimension tile size being used.
+    visit_empty_groups: If True, do not squeeze tiles for empty groups out of
+      the metadata. This is necessary for tgmm, where we at least need to zero
+      the output for each group.
+
+  Returns:
+    tuple of:
+      group_offsets: A 1d, torch.Tensor with shape [num_groups + 1] and torch.int32
+        dtype. group_offsets[i] indicates the row at which group [i] starts in
+        the lhs matrix and group_offsets[i-1] = m.
+      group_ids: A 1d, torch.Tensor with shape [m_tiles + num_groups - 1] and
+        torch.int32 dtype. group_ids[i] indicates which group grid index 'i' will
+        work on.
+      m_tile_ids: A 1d, torch.Tensor with shape [m_tiles + num_groups - 1] and
+        torch.int32. m_tile_ids[i] indicates which m-dimension tile grid index 'i'
+        will work on.
+    num_tiles: The number of m-dimension tiles to execute including overlapping
+      executions. And don't confuse this with m_tiles which is m // tm.
+  """
+  assert group_sizes.dtype == torch.int32, "group_sizes must be of torch.int32 dtype."
+
+  device = group_sizes.device
+  num_groups = group_sizes.shape[0]
+
+  # Calculate the offset of each group, starting at zero. This metadata is
+  # similar to row offsets in a CSR matrix. The following properties hold:
+  #
+  # group_offsets.shape = [num_groups + 1]
+  # group_offsets[0] = 0
+  # group_offsets[num_groups] = m
+  #
+  # The row at which group 'i' starts is group_offsets[i].
+  group_ends = torch.cumsum(group_sizes, dim=0, dtype=torch.int32)
+  group_offsets = torch.cat(
+      [torch.zeros(1, dtype=torch.int32).to(device), group_ends])
+
+  # Assign a group id to each grid index.
+  #
+  # If a group starts somewhere other than the start of a tile or ends somewhere
+  # other than the end of a tile we need to compute that full tile. Calculate
+  # the number of tiles for each group by rounding their end up to the nearest
+  # 'tm' and their start down to the nearest 'tm'.
+
+  # (1) Round the group_ends up to the nearest multiple of 'tm'.
+  #
+  # NOTE: This does not change group_offsets[num_groups], which is m
+  # (because we enforce m is divisible by tm).
+  rounded_group_ends = ((group_ends + tm - 1) // tm * tm).to(torch.int32)
+
+  # (2) Round the group_starts down to the nearest multiple of 'tm'.
+  group_starts = torch.cat(
+      [torch.zeros(1, dtype=torch.int32).to(device), group_ends[:-1]])
+  rounded_group_starts = group_starts // tm * tm
+
+  # (3) Calculate the number of rows in each group.
+  #
+  # NOTE: Handle zero-sized groups as a special case. If the start for a
+  # zero-sized group is not divisible by 'tm' its start will be rounded down and
+  # its end will be rounded up such that its size will become 1 tile here.
+  rounded_group_sizes = rounded_group_ends - rounded_group_starts
+  rounded_group_sizes = torch.where(group_sizes == 0, 0, rounded_group_sizes)
+
+  # (4) Convert the group sizes from units of rows to unit of 'tm' sized tiles.
+  #
+  # An m-dimension tile is 'owned' by group 'i' if the first row of the tile
+  # belongs to group 'i'. In addition to owned tiles, each group can have 0 or 1
+  # initial partial tiles if it's first row does not occur in the first row of a
+  # tile. The '0-th' group never has a partial tile because it always starts at
+  # the 0-th row.
+  #
+  # If no group has a partial tile, the total number of tiles is equal to
+  # 'm // tm'. If every group has a partial except the 0-th group, the total
+  # number of tiles is equal to 'm // tm + num_groups - 1'. Thus we know that
+  #
+  # tiles_m <= group_tiles.sum() <= tiles_m + num_groups - 1
+  #
+  # Where tiles_m = m // tm.
+  #
+  # NOTE: All group sizes are divisible by 'tm' because of the rounding in steps
+  # (1) and (2) so this division is exact.
+  group_tiles = rounded_group_sizes // tm
+
+  if visit_empty_groups:
+    # Insert one tile for empty groups.
+    group_tiles = torch.where(group_sizes == 0, 1, group_tiles)
+
+  # Create the group ids for each grid index based on the tile counts for each
+  # group.
+  #
+  # NOTE: This repeat(...) will pad group_ids with the final group id if
+  # group_tiles.sum() < tiles_m + num_groups - 1. The kernel grid will be sized
+  # such that we only execute the necessary number of tiles.
+  tiles_m = _calculate_num_tiles(m, tm)
+  group_ids = repeat_with_fixed_output_size(
+      torch.arange(num_groups, dtype=torch.int32).to(device), group_tiles,
+      tiles_m + num_groups - 1)
+
+  # Assign an m-dimension tile id to each grid index.
+  #
+  # NOTE: Output tiles can only be re-visited consecutively. The following
+  # procedure guarantees that m-dimension tile indices respect this.
+
+  # (1) Calculate how many times each m-dimension tile will be visited.
+  #
+  # Each tile is guaranteed to be visited once by the group that owns the tile.
+  # The remaining possible visits occur when a group starts inside of a tile at
+  # a position other than the first row. We can calculate which m-dimension tile
+  # each group starts in by floor-dividing its offset with `tm` and then count
+  # tile visits with a histogram.
+  #
+  # To avoid double counting tile visits from the group that owns the tile,
+  # filter these out by assigning their tile id to `tile_m` (one beyond the max)
+  # such that they're ignored by the subsequent histogram. Also filter out any
+  # group which is empty.
+  #
+  # TODO(tgale): Invert the 'partial_tile_mask' predicates to be more clear.
+  partial_tile_mask = torch.logical_or((group_offsets[:-1] % tm) == 0,
+                                       group_sizes == 0)
+
+  # Explicitly enable tiles for zero sized groups, if specified. This covers
+  # zero sized groups that start on a tile-aligned row and those that do not.
+  if visit_empty_groups:
+    partial_tile_mask = torch.where(group_sizes == 0, False, partial_tile_mask)
+
+  partial_tile_ids = torch.where(partial_tile_mask, tiles_m,
+                                 group_offsets[:-1] // tm)
+
+  tile_visits = (_histogram(partial_tile_ids, min=0, max=tiles_m - 1) + 1)
+
+  # Create the m-dimension tile ids for each grid index based on the visit
+  # counts for each tile.
+  m_tile_ids = repeat_with_fixed_output_size(
+      torch.arange(tiles_m, dtype=torch.int32).to(device), tile_visits,
+      tiles_m + num_groups - 1)
+
+  num_tiles = group_tiles.sum(dtype=torch.int32)
+  return group_offsets, group_ids, m_tile_ids, num_tiles
+
+
+# Repeat the `input` tensor `repeats` number of times. We expect `input` and
+# `repeats` both be 1d tensor with same shape. output shape will be [total_repeat_length].
+# If `total_repeat_length` is larger than the repeated tensor length we will use the last value
+# in the `input` to fill it up. If `total_repeat_length` is smaller than repeated tensor length
+# we will truncate the repeated tensor.
+def repeat_with_fixed_output_size(input: torch.Tensor, repeats: torch.Tensor,
+                                  total_repeat_length: int):
+  # currently only support 1d input and 1d repeats
+  assert len(input.size()) == 1
+  assert len(repeats.size()) == 1
+  device = input.device
+
+  # to better understand this code, let's assume
+  # input.size() = [10]
+  # repeats = [0, 1, 2, 0, 4, 0, 6, 7, 8, 9]
+  # total_repeat_length = 20
+
+  # shift the repeats by one
+  # tensor([0, 0, 1, 2, 0, 4, 0, 6, 7, 8])
+  exclusive_repeats = torch.roll(repeats, shifts=1)
+  exclusive_repeats[0] = 0
+
+  # tensor([ 0,  0,  1,  3,  3,  7,  7, 13, 20, 28])
+  scatter_indices = torch.cumsum(exclusive_repeats, dim=0)
+  # set the out of bound indices to 0 and calculate how many of them.
+  # tensor([ 0,  0,  1,  3,  3,  7,  7, 13,  0,  0])
+  valid_indices = torch.where(scatter_indices >= total_repeat_length,
+                              torch.zeros_like(scatter_indices),
+                              scatter_indices)
+  out_of_bound_count = torch.where(scatter_indices >= total_repeat_length, 1,
+                                   0).sum()
+
+  # tensor([2, 1, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])
+  block_split_indicators = torch.zeros(
+      total_repeat_length, dtype=torch.int32, device=device)
+  block_split_indicators.scatter_add_(0, valid_indices.to(torch.int64),
+                                      torch.ones_like(block_split_indicators))
+  # out_of_bound indices also scatter to index 0, need to offset them
+  block_split_indicators[0] -= out_of_bound_count
+
+  # value in gather_indices represents the index in the input.
+  # tensor([1, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7])
+  gather_indices = torch.cumsum(block_split_indicators, dim=0) - 1
+  res = torch.gather(input, 0, gather_indices)
+  return res
+
+
+def gmm(
+    lhs: torch.Tensor,
+    rhs: torch.Tensor,
+    group_sizes: torch.Tensor,
+    tiling: tuple[int, int, int] = (512, 512, 512)
+) -> torch.Tensor:
+  """Compute lhs[sizes[i-1]:sizes[i], :] @ rhs for each group 'i'.
+
+  Args:
+    lhs: A 2d, torch.Tensor with shape [m, k].
+    rhs: A 3d, torch.Tensor with shape [num_groups, k, n].
+    group_sizes: A 1d, torch.Tensor with shape [num_groups] and torch.int32 dtype.
+    tiling: 3-tuple of ints. The m, k and n-dimension tile sizes.
+
+  Returns:
+    A 2d, torch.Tensor with shape [m, n].
+  """
+  # Import JAX within the function such that we don't need to call the jax_import_guard()
+  # in the global scope which could cause problems for xmp.spawn.
+  jax_import_guard()
+  from jax.experimental.pallas.ops.tpu.megablox.gmm import gmm
+
+  m, k, n = lhs.shape[0], lhs.shape[1], rhs.shape[2]
+  tm, tk, tn = min(tiling[0], m), min(tiling[1], k), min(tiling[2], n)
+  preferred_element_type = lhs.dtype
+
+  payload, _ = trace_pallas(
+      gmm,
+      lhs,
+      rhs,
+      group_sizes,
+      static_argnames=["tiling", "preferred_element_type"],
+      preferred_element_type=convert_torch_dtype_to_jax(preferred_element_type),
+      tiling=(tm, tk, tn))
+
+  # Create the metadata we need for computation, and that's why need to separate
+  # the tracing and execution part.
+  group_offsets, group_ids, m_tile_ids, num_tiles = _make_group_metadata(
+      group_sizes=group_sizes,
+      m=m,
+      tm=tm,
+      visit_empty_groups=False,
+  )
+  group_offset_torch = torch.tensor([0], dtype=torch.int32).to(lhs.device)
+
+  return torch_xla._XLAC._xla_tpu_custom_call([
+      num_tiles, group_offsets, group_ids, m_tile_ids, group_offset_torch, lhs,
+      rhs
+  ], payload, [torch.Size([m, n])], [preferred_element_type])[0]
+
+
+def tgmm(
+    lhs: torch.Tensor,
+    rhs: torch.Tensor,
+    group_sizes: torch.Tensor,
+    tiling: tuple[int, int, int] = (512, 512, 512)
+) -> torch.Tensor:
+  """Compute lhs[:, sizes[i-1]:sizes[i]] @ rhs[sizes[i-1]:sizes[i], :].
+
+  Args:
+    lhs: A 2d, torch.Tensor with shape [k, m].
+    rhs: A 2d, torch.Tensor with shape [m, n].
+    group_sizes: A 1d, torch.Tensor with shape [num_groups] and torch.int32 dtype.
+    tiling: 3-tuple of ints. The m, k and n-dimension tile sizes.
+
+  Returns:
+    A  3d, torch.Tensor with shape [num_groups, k, n].
+  """
+  # Import JAX within the function such that we don't need to call the jax_import_guard()
+  # in the global scope which could cause problems for xmp.spawn.
+  jax_import_guard()
+  from jax.experimental.pallas.ops.tpu.megablox.gmm import tgmm
+
+  k, m, n, num_groups = lhs.shape[0], lhs.shape[1], rhs.shape[
+      1], group_sizes.shape[0]
+  tm, tk, tn = min(tiling[0], m), min(tiling[1], k), min(tiling[2], n)
+  preferred_element_type = lhs.dtype
+
+  payload, _ = trace_pallas(
+      tgmm,
+      lhs,
+      rhs,
+      group_sizes,
+      static_argnames=["tiling", "preferred_element_type"],
+      preferred_element_type=convert_torch_dtype_to_jax(preferred_element_type),
+      tiling=(tm, tk, tn))
+
+  # Create the metadata we need for computation, and that's why need to separate
+  # the tracing and execution part.
+  group_offsets, group_ids, m_tile_ids, num_tiles = _make_group_metadata(
+      group_sizes=group_sizes,
+      m=m,
+      tm=tm,
+      visit_empty_groups=True,
+  )
+  group_offset_torch = torch.tensor([0], dtype=torch.int32).to(lhs.device)
+
+  return torch_xla._XLAC._xla_tpu_custom_call([
+      num_tiles, group_offsets, group_ids, m_tile_ids, group_offset_torch,
+      lhs.t(), rhs
+  ], payload, [torch.Size([num_groups, k, n])], [preferred_element_type])[0]
+
+
+def gmm_backward(grad, lhs, rhs, group_sizes, tiling=(512, 512, 512)):
+  grad_lhs = gmm(grad, rhs.transpose(-1, -2), group_sizes, tiling)
+  grad_rhs = tgmm(lhs.t(), grad, group_sizes, tiling)
+  return grad_lhs, grad_rhs
+
+
+class GMM(torch.autograd.Function):
+
+  @staticmethod
+  def forward(ctx, lhs, rhs, group_sizes, tiling=(512, 512, 512)):
+    ctx.save_for_backward(lhs, rhs, group_sizes)
+    ctx.tiling = tiling
+    return gmm(lhs, rhs, group_sizes, tiling)
+
+  @staticmethod
+  def backward(ctx, grad_output):
+    lhs, rhs, group_sizes = ctx.saved_tensors
+    grad_lhs, grad_rhs = gmm_backward(grad_output, lhs, rhs, group_sizes,
+                                      ctx.tiling)
+    return grad_lhs, grad_rhs, None, None
+
+
+def non_xla_attetion(q, k, v, attention_type):
+  # This will be called when dynamo use fake tensor to construct the fake output.
+  # We need to make sure output tensor's shape is correct.
+  if k.device != torch.device("meta"):
+    warnings.warn(
+        f'XLA {attention_type} attention should only be applied to tensors on XLA device'
+    )
+
+  # Return orignal shape of q.
+  return torch.empty_like(q)
 
 
 XLA_LIB.define(
@@ -333,14 +871,32 @@ def flash_attention_non_xla(q: torch.Tensor,
                             k: torch.Tensor,
                             v: torch.Tensor,
                             causal: bool = False):
-  # This will be called when dynamo use fake tensor to construct the fake output.
-  # We need to make sure output tensor's shape is correct.
-  if k.device != torch.device("meta"):
-    warnings.warn(
-        'XLA flash attention should only be applied to tensors on XLA device')
+  return non_xla_attetion(q, k, v, "flash")
+
+
+XLA_LIB.define(
+    "paged_attention(Tensor q, Tensor k_pages, Tensor v_pages, Tensor lengths, Tensor page_indices, int pages_per_compute_block, str megacore_mode=None) -> Tensor",
+)
+
 
-  # perform a regular attention if input tensors are not on XLA device.
-  attn_weight = q @ k.transpose(-2, -1)
-  attn_weight = torch.nn.functional.softmax(attn_weight, dim=-1)
-  attn_output = attn_weight @ v
-  return attn_output
+@impl(XLA_LIB, "paged_attention", "XLA")
+def paged_attention_xla(q: torch.Tensor,
+                        k_pages: torch.Tensor,
+                        v_pages: torch.Tensor,
+                        lengths: torch.Tensor,
+                        page_indices: torch.Tensor,
+                        pages_per_compute_block: int,
+                        megacore_mode: str = None):
+  return paged_attention(q, k_pages, v_pages, lengths, page_indices,
+                         pages_per_compute_block, megacore_mode)
+
+
+@impl(XLA_LIB, "paged_attention", "CompositeExplicitAutograd")
+def paged_attention_non_xla(q: torch.Tensor,
+                            k_pages: torch.Tensor,
+                            v_pages: torch.Tensor,
+                            lengths: torch.Tensor,
+                            page_indices: torch.Tensor,
+                            pages_per_compute_block: int,
+                            megacore_mode: str = None):
+  return non_xla_attetion(q, k_pages, v_pages, "paged")
diff --git a/torch_xla/experimental/distributed_checkpoint/__init__.py b/torch_xla/experimental/distributed_checkpoint/__init__.py
index cad57c3a405..a29b943f217 100644
--- a/torch_xla/experimental/distributed_checkpoint/__init__.py
+++ b/torch_xla/experimental/distributed_checkpoint/__init__.py
@@ -1,8 +1,10 @@
 from .manager import CheckpointManager
 from .planners import SPMDSavePlanner, SPMDLoadPlanner
+from .util import prime_optimizer
 
 __all__ = [
     "CheckpointManager",
     "SPMDSavePlanner",
     "SPMDLoadPlanner",
+    "prime_optimizer",
 ]
diff --git a/torch_xla/experimental/distributed_checkpoint/manager.py b/torch_xla/experimental/distributed_checkpoint/manager.py
index 4ce57b5fb38..5d4ce7814e2 100644
--- a/torch_xla/experimental/distributed_checkpoint/manager.py
+++ b/torch_xla/experimental/distributed_checkpoint/manager.py
@@ -66,7 +66,10 @@ class CheckpointManager:
   >>> if tracked_steps:
   >>>   # Choose the highest step
   >>>   best_step = max(tracked_steps)
-  >>>   state_dict = {'model': model.state_dict()}
+  >>>   # Before restoring the checkpoint, the optimizer state must be primed
+  >>>   # to allow state to be loaded into it.
+  >>>   prime_optimizer(optim)
+  >>>   state_dict = {'model': model.state_dict(), 'optim': optim.state_dict()}
   >>>   chkpt_mgr.restore(best_step, state_dict)
   >>>   model.load_state_dict(state_dict['model'])
 
diff --git a/torch_xla/experimental/distributed_checkpoint/util.py b/torch_xla/experimental/distributed_checkpoint/util.py
new file mode 100644
index 00000000000..198cb350323
--- /dev/null
+++ b/torch_xla/experimental/distributed_checkpoint/util.py
@@ -0,0 +1,44 @@
+import torch
+from torch.utils._pytree import tree_map
+import torch_xla
+import torch_xla.core.xla_model as xm
+
+
+def prime_optimizer(optimizer: torch.optim.Optimizer) -> torch.optim.Optimizer:
+  """
+  Prime the optimizer state by running a dummy weight update.
+
+  Optimizer state isn't created until after the first training step. Since the
+  distributed checkpointing library loads the state_dict in-place, the
+  optimizer state must already exist before loading the checkpoint.
+
+  This utility method runs a dummy weight update with zero gradient to ensure
+  the optimizer state exists and can be loaded into.
+
+  **Warning** This method calls `optimizer.step`, which can impact the
+  optimizer's state and model parameters. Therefore, it should only be used
+  prior to restoring a checkpoint, when the state and parameters will be
+  immediately overwritten.
+
+  Args:
+    optimizer: The optimizer whose state should be primed for checkpoint
+               loading.
+  """
+
+  # Initial mark_step to ensure all param_groups are backed by device data.
+  xm.mark_step()
+  xm.wait_device_ops()
+
+  def zero_grad(x):
+    if isinstance(x, torch.Tensor) and x.requires_grad:
+      x.grad = torch.zeros_like(x, requires_grad=False)
+      param_sharding = torch_xla._XLAC._get_xla_op_sharding(x)
+      if param_sharding:
+        # Match the gradient sharding to the parameter's.
+        torch_xla._XLAC._xla_mark_sharding(x.grad, param_sharding)
+
+  tree_map(zero_grad, optimizer.param_groups)
+  optimizer.step()
+  xm.mark_step()
+  xm.wait_device_ops()
+  return optimizer
diff --git a/torch_xla/experimental/plugins.py b/torch_xla/experimental/plugins.py
index 77c2a572de3..620dff7e45c 100644
--- a/torch_xla/experimental/plugins.py
+++ b/torch_xla/experimental/plugins.py
@@ -76,7 +76,9 @@ def use_dynamic_plugins():
 
 
 def using_dynamic_plugins():
-  return xu.getenv_as(xenv.PJRT_DYNAMIC_PLUGINS, bool, False)
+  # TODO: dummy plugin for CPU
+  return xu.getenv_as(xenv.PJRT_DYNAMIC_PLUGINS, bool,
+                      False) and xr.device_type() != "CPU"
 
 
 def default() -> DevicePlugin:
diff --git a/torch_xla/experimental/spmd_fully_sharded_data_parallel.py b/torch_xla/experimental/spmd_fully_sharded_data_parallel.py
index 461d66b8565..142f9bc7561 100644
--- a/torch_xla/experimental/spmd_fully_sharded_data_parallel.py
+++ b/torch_xla/experimental/spmd_fully_sharded_data_parallel.py
@@ -13,17 +13,24 @@
 from torch_xla.distributed.fsdp.wrap import recursive_wrap
 
 
-def _prepare_spmd_partition_spec(param):
-  partition_spec = [None] * len(param.shape)
+def _prepare_spmd_partition_spec(param,
+                                 extra_data_axis=None,
+                                 shard_maximal=False):
+  shape = param.shape
+  partition_spec = [None] * len(shape)
   # Skip scalar tensors and it replicated.
   if len(partition_spec) == 0:
     return partition_spec
 
-  # Only shard the 0th dimension of the parameter according to the
-  # fsdp axis of the mesh.
-  # TODO: should we shard on the maximal dim for param? Then we need
-  # another helper for the output.
-  partition_spec[0] = "fsdp"
+  # Shard the 0th dimension of the parameter according to the
+  # fsdp axis of the mesh, if shard_maximal is not specified.
+  index = 0
+  if shard_maximal:
+    index = shape.index(max(shape))
+
+  partition_spec[index] = "fsdp"
+  if extra_data_axis:
+    partition_spec[index] = (extra_data_axis, "fsdp")
   return tuple(partition_spec)
 
 
@@ -44,10 +51,12 @@ class SpmdFullyShardedDataParallel(nn.Module):
   def __init__(
       self,
       module: nn.Module,
+      *,
       mesh: Optional[spmd.Mesh] = None,
       shard_output: Optional[Callable] = None,
       auto_wrap_policy: Optional[Callable] = None,
       auto_wrapper_callable: Optional[Callable] = None,
+      extra_data_axis: Optional[str] = None,
   ):
     if isinstance(module, SpmdFullyShardedDataParallel):
       raise RuntimeError(
@@ -74,6 +83,9 @@ def __init__(
         )
     if "fsdp" not in mesh.axis_names:
       raise ValueError("The mesh must have an axis named 'fsdp'.")
+    if extra_data_axis and extra_data_axis not in mesh.axis_names:
+      raise ValueError(
+          f"The provided {extra_data_axis} axis is not in the mesh.")
 
     super().__init__()
 
@@ -106,7 +118,8 @@ def __init__(
     for param in module.parameters():
       if torch_xla._XLAC._get_xla_sharding_spec(param) != "":
         continue
-      spmd.mark_sharding(param, mesh, _prepare_spmd_partition_spec(param))
+      spmd.mark_sharding(
+          param, mesh, _prepare_spmd_partition_spec(param, shard_maximal=True))
 
     # Register a backward hook to place optimization barrier to prevent
     # gigantic fusions on syncing the gradients.
@@ -130,8 +143,9 @@ def shard_output_impl(output, mesh):
               f"The output type is not supported: {type(output)}. Please provide your own shard_output callable."
           )
 
-        spmd.mark_sharding(real_output, mesh,
-                           _prepare_spmd_partition_spec(real_output))
+        spmd.mark_sharding(
+            real_output, mesh,
+            _prepare_spmd_partition_spec(real_output, extra_data_axis))
 
       shard_output = shard_output_impl
 
diff --git a/torch_xla/experimental/stablehlo_custom_call.py b/torch_xla/experimental/stablehlo_custom_call.py
new file mode 100644
index 00000000000..e729d0b7791
--- /dev/null
+++ b/torch_xla/experimental/stablehlo_custom_call.py
@@ -0,0 +1,31 @@
+import torch
+import torch_xla
+
+
+# TODO(lsy323): Register as a torch op, cannot do that because parameter
+# `ScalarType[] output_dtypes` in the op schema has some problem.
+def stablehlo_custom_call(args,
+                          call_target,
+                          output_shapes,
+                          output_dtypes,
+                          has_side_effect=False,
+                          backend_config="",
+                          api_version=0):
+  res = torch_xla._XLAC._xla_custom_call(args, call_target, output_shapes,
+                                         output_dtypes, has_side_effect,
+                                         backend_config, api_version)
+  if len(output_shapes) == 1:
+    return res[0]
+  return res
+
+
+def extract_custom_call_outputs_shape_dtype(n: torch.fx.Node):
+  assert 'val' in n.meta
+  if isinstance(n.meta['val'], torch.Tensor):
+    return [n.meta['val'].shape], [n.meta['val'].dtype]
+  output_shape_dtype = [(t.shape,
+                         t.dtype) if isinstance(t, torch.Tensor) else None
+                        for t in n.meta['val']]
+  assert None not in output_shape_dtype
+  output_shape, output_dtype = zip(*output_shape_dtype)
+  return output_shape, output_dtype
diff --git a/torch_xla/experimental/xla_mlir_debuginfo.py b/torch_xla/experimental/xla_mlir_debuginfo.py
index 4d12e20b2b2..e74617123be 100644
--- a/torch_xla/experimental/xla_mlir_debuginfo.py
+++ b/torch_xla/experimental/xla_mlir_debuginfo.py
@@ -8,7 +8,6 @@
 # Enable debug info automatically when importing this file. This is necessary
 # to propagate any debug info to downstream MLIR locations.
 os.environ["XLA_HLO_DEBUG"] = "1"
-xla_device = xm.xla_device()
 
 XLA_LIB.define("write_mlir_debuginfo(Tensor x, str data) -> Tensor")
 
@@ -31,7 +30,7 @@ def write_mlir_debuginfo(x, data: str):
 
 @torch.library.impl(XLA_LIB, "write_mlir_debuginfo",
                     "CompositeExplicitAutograd")
-def write_mlir_debuginfo(x, data: str):
+def write_mlir_debuginfo_tensor(x, data: str):
   return x
 
 
diff --git a/torch_xla/stablehlo.py b/torch_xla/stablehlo.py
index 3642354ab91..adf642f37a6 100644
--- a/torch_xla/stablehlo.py
+++ b/torch_xla/stablehlo.py
@@ -1,24 +1,27 @@
 import copy
-from dataclasses import dataclass
+import dataclasses
 import enum
 import json
 import os
-from typing import List, Tuple, Optional, Mapping, Any, Dict
-import dataclasses
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Mapping, Optional, Set, Tuple
 
 import numpy as np
 import torch
-from torch.fx import _pytree as fx_pytree
 import torch_xla
-from torch_xla.core import xla_model as xm
-from torch_xla.core import dynamo_bridge
-from torch_xla.debug import metrics
 import torch_xla.experimental.quantized
-from torch_xla.experimental.unbounded_dynamism_export import exported_program_has_symbolic_input_shape, process_exported_program_with_symbolic_input
-from torch.utils import _pytree as pytree
 from torch._decomp import get_decompositions
-
-from typing import Tuple
+from torch._export.serde.serialize import GraphModuleSerializer
+from torch.fx import _pytree as fx_pytree
+from torch.utils import _pytree as pytree
+from torch_xla.core import dynamo_bridge
+from torch_xla.core import xla_model as xm
+from torch_xla.debug import metrics
+from torch_xla.experimental.stablehlo_custom_call import (
+    extract_custom_call_outputs_shape_dtype, stablehlo_custom_call)
+from torch_xla.experimental.unbounded_dynamism_export import (
+    exported_program_has_symbolic_input_shape,
+    process_exported_program_with_symbolic_input)
 
 
 def _get_numpy_dtype(dtype):
@@ -59,6 +62,11 @@ class StableHLOExportOptions:
 
   # Whether to export the weights
   export_weights: bool = True
+  # Ops that will be mapped to stablehlo.custom_call in the
+  # exported StableHLO graph.
+  custom_ops_allowed_in_graph: Set[str] = field(default_factory=set)
+  # Export node metadata to NamedLoc in StableHLO.
+  export_node_metadata: bool = False
 
 
 class StableHLOGraphModule:
@@ -214,10 +222,13 @@ class StableHLOModelBundle:
 
 class XLAExportInterpreter(torch.fx.Interpreter):
 
-  def __init__(self, module, device):
+  def __init__(self, module, device, custom_ops_allowed_in_graph,
+               gm_serializer):
     self._device = device
     super().__init__(module)
     self.tensor_id_to_dynamic_dims = {}
+    self.custom_ops_allowed_in_graph = custom_ops_allowed_in_graph
+    self.gm_serializer = gm_serializer
 
   def _mark_dynamic(self, tensor, dynamic_dims):
     tid = torch_xla._XLAC._xla_get_tensor_id(tensor)
@@ -261,8 +272,24 @@ def run_node(self, n) -> Any:
             i for i, x in enumerate(fake_t.shape) if not isinstance(x, int)
         ]
         self._mark_dynamic(res, dynamic_dims)
-      return res
-    return super().run_node(n)
+    elif n.op == 'call_function':
+      if hasattr(n.target, 'namespace'
+                ) and n.target.namespace in self.custom_ops_allowed_in_graph:
+        output_shapes, output_dtypes = extract_custom_call_outputs_shape_dtype(
+            n)
+        call_name = str(n.target)
+        n.target = stablehlo_custom_call
+        n.args = (n.args, call_name, output_shapes, output_dtypes)
+      res = super().run_node(n)
+    else:
+      res = super().run_node(n)
+    if self.gm_serializer is not None:
+      from torch_xla.experimental.xla_mlir_debuginfo import write_mlir_debuginfo
+      node_metadata = json.dumps(self.gm_serializer.serialize_metadata(n))
+      pytree.tree_map_only(torch.Tensor,
+                           lambda x: write_mlir_debuginfo(x, node_metadata),
+                           res)
+    return res
 
 
 def _extract_input_args(exported_model, options):
@@ -320,7 +347,17 @@ def _exported_program_to_stablehlo_bundle(exported_model,
   if options.inline_all_constant:
     # Inline all constants.
     torch_xla._XLAC._set_xla_all_numbers_special_scalars(True)
-  xla_interpreter = XLAExportInterpreter(exported_model.graph_module, device)
+  xla_hlo_debug_env = os.environ.get("XLA_HLO_DEBUG", "0")
+  if options.export_node_metadata:
+    gm_serializer = GraphModuleSerializer(exported_model.graph_signature,
+                                          exported_model.module_call_graph)
+    os.environ["XLA_HLO_DEBUG"] = "1"
+  else:
+    gm_serializer = None
+
+  xla_interpreter = XLAExportInterpreter(exported_model.graph_module, device,
+                                         options.custom_ops_allowed_in_graph,
+                                         gm_serializer)
   with torch.no_grad():
     res = xla_interpreter.run(*_flat_input_args, enable_io_processing=False)
     res = res[num_mutations:]
@@ -442,6 +479,8 @@ def _exported_program_to_stablehlo_bundle(exported_model,
 
   # Recover the global flag to not inline all scalars.
   torch_xla._XLAC._set_xla_all_numbers_special_scalars(False)
+  # Recover the global XLA_HLO_DEBUG flag
+  os.environ["XLA_HLO_DEBUG"] = xla_hlo_debug_env
 
   return bundle
 
diff --git a/torch_xla/torch_xla.py b/torch_xla/torch_xla.py
index 141d7e3e5a7..f5f56ba9cbc 100644
--- a/torch_xla/torch_xla.py
+++ b/torch_xla/torch_xla.py
@@ -1,4 +1,6 @@
+import contextlib
 from typing import List
+
 import torch
 import torch_xla
 import torch_xla.core.xla_model as xm
@@ -50,3 +52,20 @@ def device_count() -> int:
 def sync():
   """Launches all pending graph operations."""
   xm.mark_step()
+
+
+@contextlib.contextmanager
+def step():
+  """Wraps code that should be dispatched to the runtime.
+
+  Experimental: `xla.step` is still a work in progress. Some code that currently
+  works with `xla.step` but does not follow best practices will become errors in
+  future releases. See https://github.com/pytorch/xla/issues/6751 for context.
+  """
+  # Clear pending operations
+  xm.mark_step()
+
+  try:
+    yield
+  finally:
+    xm.mark_step()
diff --git a/torch_xla/utils/dlpack.py b/torch_xla/utils/dlpack.py
new file mode 100644
index 00000000000..c49083e4403
--- /dev/null
+++ b/torch_xla/utils/dlpack.py
@@ -0,0 +1,36 @@
+from typing import Any
+import enum
+import torch_xla
+
+
+def to_dlpack(xla_tensor: Any):
+  return torch_xla._XLAC._to_dlpack(xla_tensor)
+
+
+class DLDeviceType(enum.IntEnum):
+  # Enums as in DLPack specification (aten/src/ATen/dlpack.h)
+  kDLCPU = 1,
+  kDLGPU = 2,
+  kDLCPUPinned = 3,
+  kDLOpenCL = 4,
+  kDLVulkan = 7,
+  kDLMetal = 8,
+  kDLVPI = 9,
+  kDLROCM = 10,
+  kDLExtDev = 12,
+  kDLOneAPI = 14,
+
+
+def from_dlpack(ext_tensor: Any):
+  if hasattr(ext_tensor, '__dlpack_device__') and hasattr(
+      ext_tensor, '__dlpack__'):
+    device_type, device_id = ext_tensor.__dlpack_device__()
+    if device_type == DLDeviceType.kDLGPU:
+      stream = torch_xla._XLAC._get_stream_for_cuda_device(device_id)
+      dlpack = ext_tensor.__dlpack__(stream=stream)
+    else:
+      dlpack = ext_tensor.__dlpack__()
+  else:
+    dlpack = ext_tensor
+
+  return torch_xla._XLAC._from_dlpack(dlpack)