diff --git a/master/_modules/index.html b/master/_modules/index.html
index 9a5a00b4731..5e714bc6cce 100644
--- a/master/_modules/index.html
+++ b/master/_modules/index.html
@@ -227,7 +227,7 @@
pip3 install torch==2.3.0
-# GPU whl for python 3.10 + cuda 12.1
-pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.3.0-cp310-cp310-manylinux_2_28_x86_64.whl
+pip3 install torch==2.2.0
+pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.2.0-cp38-cp38-manylinux_2_28_x86_64.whl
-Wheels for other Python versions and CUDA versions can be found here.
GSPMD is an automatic parallelization system for common ML workloads. The XLA compiler transforms the single-device program into a partitioned one with the proper collectives, based on user-provided sharding hints. This feature lets developers write PyTorch programs as if they were running on a single large device, without any custom sharded computation ops or collective communications, and still scale.
*Figure 1. Comparison of two different execution strategies, (a) for non-SPMD and (b) for SPMD.*
To support GSPMD in PyTorch/XLA, we are introducing a new execution mode. Before GSPMD, the execution mode in PyTorch/XLA assumed multiple model replicas, each with a single core (Figure 1.a). This mode of execution, as illustrated above, suits data parallelism frameworks like the popular PyTorch Distributed Data Parallel (DDP) or Fully Sharded Data Parallel (FSDP), but is limited in that a replica can only reside on one device core for execution. PyTorch/XLA SPMD introduces a new execution mode that assumes a single replica with multiple cores (Figure 1.b), allowing a replica to run across multiple device cores. This shift unlocks more advanced parallelism strategies for better large model training performance.
PyTorch/XLA SPMD is available on the new PJRT runtime. To enable PyTorch/XLA SPMD execution mode, the user must call the [use_spmd() API](https://github.com/pytorch/xla/blob/b8b484515a97f74e013dcf38125c44d53a41f011/torch_xla/runtime.py#L214):
import torch_xla.runtime as xr
xr.use_spmd()  # enable the SPMD execution mode
@@ -3330,8 +3328,8 @@
Simple Example & Sharding Annotation API
For a given cluster of devices, a physical mesh is a representation of the interconnect topology.
We derive a logical mesh based on this topology to create sub-groups of devices which can be used for partitioning different axes of tensors in a model.
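As a minimal sketch of constructing such a logical mesh (assuming an even number of addressable devices; the 2-way split along the second axis and the 'data'/'model' axis names are illustrative choices that match the named-mesh snippet below):

```python
import numpy as np
import torch_xla.runtime as xr
from torch_xla.distributed.spmd import Mesh

# All addressable device ids for this SPMD program, as a flat array.
num_devices = xr.global_runtime_device_count()
device_ids = np.array(range(num_devices))

# Fold the flat device list into a 2D logical mesh: the first axis is used for
# data parallelism, the second for model parallelism (assumes an even device count).
mesh_shape = (num_devices // 2, 2)
mesh = Mesh(device_ids, mesh_shape, ('data', 'model'))
```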
@@ -3375,8 +3373,8 @@
partition_spec has the same rank as the input tensor. Each dimension describes how the corresponding input tensor dimension is sharded across the device mesh (logically defined by mesh_shape). partition_spec is a tuple of device_mesh dimension indices or None. An index can be an int, or a str if the corresponding mesh dimension is named. This specifies how each input dimension is sharded (an index into mesh_shape) or replicated (None).
# Provide optional mesh axis names and use them in the partition spec
mesh = Mesh(device_ids, (4, 2), ('data', 'model'))
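Continuing that snippet, here is a hedged sketch of attaching the partition spec to a tensor via mark_sharding; the eight-device assumption (needed for the 4x2 mesh), the xs alias, and the 16x8 input tensor are illustrative, not required:

```python
import numpy as np
import torch
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr
import torch_xla.distributed.spmd as xs
from torch_xla.distributed.spmd import Mesh

xr.use_spmd()

# Same 4x2 named mesh as above; requires 8 addressable devices.
device_ids = np.array(range(xr.global_runtime_device_count()))
mesh = Mesh(device_ids, (4, 2), ('data', 'model'))

# Shard dim 0 across the 'data' axis and dim 1 across the 'model' axis.
t = torch.randn(16, 8).to(xm.xla_device())
xs.mark_sharding(t, mesh, ('data', 'model'))
```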
@@ -3671,7 +3669,7 @@
SPMD Debugging Tool
We provide a shard placement visualization debug tool for PyTorch/XLA SPMD users on TPU/GPU/CPU with single-host/multi-host setups: you can use visualize_tensor_sharding to visualize a sharded tensor, or visualize_sharding to visualize a sharding string. Here are two code examples on a TPU single host (v4-8) with visualize_tensor_sharding or visualize_sharding:
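A rough sketch standing in for the first of those two examples (the import path of the debugging helpers, and the mesh and tensor shapes, are assumptions that may vary by release; the second example is analogous but passes a sharding spec string to visualize_sharding):

```python
import numpy as np
import torch
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr
import torch_xla.distributed.spmd as xs
from torch_xla.distributed.spmd import Mesh
from torch_xla.distributed.spmd.debugging import visualize_tensor_sharding

xr.use_spmd()

# Shard dim 0 of the tensor across all devices on the host.
device_ids = np.array(range(xr.global_runtime_device_count()))
mesh = Mesh(device_ids, (len(device_ids), 1), ('x', 'y'))

t = torch.randn(8, 4).to(xm.xla_device())
xs.mark_sharding(t, mesh, ('x', 'y'))

# Prints a table showing which device holds each shard of `t`.
generated_table = visualize_tensor_sharding(t, use_color=False)
```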
The following example uses torch.distributed._tensor.distribute_module with auto-policy and xla:
```python
import torch_xla.runtime as xr
from torch.distributed._tensor import DeviceMesh, distribute_module
from torch_xla.distributed.spmd import auto_policy
@@ -3720,7 +3718,7 @@
model = MyModule()  # nn.Module
sharded_model = distribute_module(model, device_mesh, auto_policy)
```

Optionally, one can set the following options/env-vars to control the behavior of the XLA-based auto-sharding pass:
Currently, gradient checkpointing needs to be applied to the module before the FSDP wrapper. Otherwise, recursively looping into child modules will end up in an infinite loop. We will fix this issue in future releases.
Example usage:
from torch_xla.distributed.fsdp import checkpoint_module
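A minimal sketch of that ordering with the XLA FSDP wrapper; MyModel and its layer sizes are hypothetical and only illustrate that checkpoint_module wraps the module before the FSDP wrapper does:

```python
import torch.nn as nn
import torch_xla.core.xla_model as xm
from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP
from torch_xla.distributed.fsdp import checkpoint_module


class MyModel(nn.Module):  # hypothetical module, for illustration only
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(128, 128), nn.ReLU(), nn.Linear(128, 10))

    def forward(self, x):
        return self.net(x)


# Gradient checkpointing is applied to the module first, then FSDP wraps the result.
model = FSDP(checkpoint_module(MyModel().to(xm.xla_device())))
```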
@@ -3971,9 +3969,9 @@
HuggingFace Llama 2 Example
What is PyTorch/XLA SPMD?