Update PyTorch/XLA nightly and 2.6rc tests (#505)
tengyifei authored Dec 13, 2024
1 parent 27821e2 commit 1c6bbe4
Showing 14 changed files with 62 additions and 49 deletions.
2 changes: 1 addition & 1 deletion dags/legacy_test/tests/pytorch/nightly/common.libsonnet
@@ -104,7 +104,7 @@ local volumes = import 'templates/volumes.libsonnet';
sudo apt install -y libsndfile-dev
pip3 install --user --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
pip install --user \
- 'torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl' \
+ 'torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev-cp310-cp310-linux_x86_64.whl' \
-f https://storage.googleapis.com/libtpu-releases/index.html \
-f https://storage.googleapis.com/libtpu-wheels/index.html
pip3 install pillow
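The nightly setup above only installs the wheels. As a quick post-install sanity check of the kind these legacy tests rely on, something like the following Python sketch could be run on the TPU VM; it is not part of this commit, and the helper name plus the expected 2.7.0 version prefix (matching the new nightly wheel) are assumptions.

```python
# Hypothetical post-install smoke check; not part of this commit.
# Assumes torch and torch_xla were installed by the nightly setup above.
import torch
import torch_xla
import torch_xla.core.xla_model as xm


def check_nightly_install(expected_prefix: str = "2.7.0") -> None:
    """Verify the torch_xla nightly version and run a tiny op on the XLA device."""
    print("torch:", torch.__version__)
    print("torch_xla:", torch_xla.__version__)
    if not torch_xla.__version__.startswith(expected_prefix):
        raise RuntimeError(
            f"expected a {expected_prefix}* nightly wheel, got {torch_xla.__version__}"
        )
    device = xm.xla_device()  # TPU device when libtpu is present
    result = torch.ones(2, 2, device=device) + 1
    print("device:", device, "result:", result.cpu().tolist())


if __name__ == "__main__":
    check_nightly_install()
```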
@@ -19,14 +19,14 @@ local utils = import 'templates/utils.libsonnet';
local volumes = import 'templates/volumes.libsonnet';

{
- local r2_5_1 = {
- frameworkPrefix: 'pt-2-5-1',
+ local r2_6 = {
+ frameworkPrefix: 'pt-2-6',
tpuSettings+: {
softwareVersion: 'tpu-ubuntu2204-base',
},
- imageTag: 'r2.5.1_3.10',
+ imageTag: 'r2.6.0-rc1_3.10',
},
- PyTorchTest:: common.PyTorchTest + r2_5_1 {
+ PyTorchTest:: common.PyTorchTest + r2_6 {
local config = self,

podTemplate+:: {
@@ -67,7 +67,7 @@ local volumes = import 'templates/volumes.libsonnet';
ctc = cloud_tpu_client.Client(tpu=os.path.basename('$(TPU_NAME)'), zone=os.path.dirname('$(TPU_NAME)'))
ctc.wait_for_healthy()
- ctc.configure_tpu_version(f'pytorch-2.5.1-dev{libtpu_date}', restart_type='always')
+ ctc.configure_tpu_version(f'pytorch-2.6-dev{libtpu_date}', restart_type='always')
ctc.wait_for_healthy()
|||,
],
@@ -102,16 +102,16 @@ local volumes = import 'templates/volumes.libsonnet';
sudo apt install -y libopenblas-base
# for huggingface tests
sudo apt install -y libsndfile-dev
- # Install torchvision by pinned commit in PyTorch 2.5.1 release branch.
- pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/test/cpu
- # torchvision commit reference: https://github.com/pytorch/pytorch/blob/v2.5.1/.github/ci_commit_pins/vision.txt
+ # Install torchvision by pinned commit in PyTorch 2.6 release branch.
+ pip install torch==2.6 --index-url https://download.pytorch.org/whl/test/cpu
+ # torchvision commit reference: https://github.com/pytorch/pytorch/blob/release/2.6/.github/ci_commit_pins/vision.txt
pip install --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@d23a6e1664d20707c11781299611436e1f0c104f"
- pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.5.1-cp310-cp310-manylinux_2_28_x86_64.whl
- pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
+ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0rc1-cp310-cp310-manylinux_2_28_x86_64.whl
+ pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html -f https://storage.googleapis.com/libtpu-wheels/index.html
pip install pillow
git clone --depth=1 https://github.com/pytorch/pytorch.git
cd pytorch
- git clone -b v2.5.1 https://github.com/pytorch/xla.git
+ git clone -b r2.6 https://github.com/pytorch/xla.git
|||,
},
podTemplate+:: {
@@ -147,12 +147,12 @@ local volumes = import 'templates/volumes.libsonnet';
nvidia-smi
pip uninstall -y torch torchvision
- pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/test/cpu
+ pip install torch==2.6 --index-url https://download.pytorch.org/whl/test/cpu
pip install --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@d23a6e1664d20707c11781299611436e1f0c104f"
- pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.5.1-cp310-cp310-manylinux_2_28_x86_64.whl
+ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0rc1-cp310-cp310-manylinux_2_28_x86_64.whl
mkdir -p pytorch/xla
- git clone -b v2.5.1 https://github.com/pytorch/xla.git pytorch/xla
+ git clone -b r2.6 https://github.com/pytorch/xla.git pytorch/xla
%s
@@ -224,5 +224,5 @@ local volumes = import 'templates/volumes.libsonnet';
},

// DEPRECATED: Use PyTorchTpuVmMixin instead
- tpu_vm_r2_5_1_install: self.PyTorchTpuVmMixin.tpuSettings.tpuVmPytorchSetup,
+ tpu_vm_r2_6_install: self.PyTorchTpuVmMixin.tpuSettings.tpuVmPytorchSetup,
}
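For context, the TPU software-version step this file updates (pytorch-2.5.1-dev… to pytorch-2.6-dev…) goes through cloud_tpu_client, as shown in the hunk above. The sketch below restates that step as standalone Python; reading TPU_NAME from the environment and the example libtpu_date are illustrative assumptions, not code from this commit.

```python
# Standalone restatement of the runtime-version step shown in the diff above.
# Assumption: TPU_NAME looks like "<zone>/<node-name>", matching the
# basename/dirname split used in the test template.
import os

import cloud_tpu_client


def configure_tpu_runtime(libtpu_date: str = "20241213") -> None:
    """Point the TPU at the matching pytorch-2.6 dev runtime and wait until healthy."""
    tpu_name = os.environ["TPU_NAME"]  # e.g. "us-central2-b/my-tpu" (assumed)
    ctc = cloud_tpu_client.Client(
        tpu=os.path.basename(tpu_name), zone=os.path.dirname(tpu_name)
    )
    ctc.wait_for_healthy()
    ctc.configure_tpu_version(f"pytorch-2.6-dev{libtpu_date}", restart_type="always")
    ctc.wait_for_healthy()


if __name__ == "__main__":
    configure_tpu_runtime()
```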
4 changes: 2 additions & 2 deletions dags/legacy_test/tests/pytorch/targets.jsonnet
@@ -13,10 +13,10 @@
// limitations under the License.

local nightly = import 'nightly/targets.jsonnet';
- local r2_5_1 = import 'r2.5.1/targets.jsonnet';
+ local r2_6 = import 'r2.6/targets.jsonnet';

// Add new versions here
std.flattenArrays([
nightly,
- r2_5_1,
+ r2_6,
])
19 changes: 17 additions & 2 deletions dags/pytorch_xla/configs/pytorchxla_torchbench_config.py
@@ -32,13 +32,14 @@ class VERSION(enum.Enum):
R2_4 = enum.auto()
R2_5 = enum.auto()
R2_5_1 = enum.auto()
+ R2_6 = enum.auto()


class VERSION_MAPPING:

class NIGHTLY(enum.Enum):
- TORCH_XLA_TPU_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl"
- TORCH_XLA_CUDA_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl"
+ TORCH_XLA_TPU_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev-cp310-cp310-linux_x86_64.whl"
+ TORCH_XLA_CUDA_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.7.0.dev-cp310-cp310-linux_x86_64.whl"
TORCH = "torch"
TORCHVISION = "torchvision"
TORCHAUDIO = "torchaudio"
@@ -109,6 +110,18 @@ class R2_5_1(enum.Enum):
TORCH_REPO_BRANCH = "-b v2.5.1"
TORCH_XLA_REPO_BRANCH = "-b v2.5.1"

+ class R2_6(enum.Enum):
+ TORCH_XLA_TPU_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0rc1-cp310-cp310-manylinux_2_28_x86_64.whl"
+ TORCH_XLA_CUDA_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.6.0rc1-cp310-cp310-linux_x86_64.whl"
+ TORCH = "torch==2.6.0"
+ TORCHVISION = "torchvision==0.20.1"
+ TORCHAUDIO = "torchaudio==2.6.0"
+ TORCH_XLA_GPU_DOCKER = "us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.6.0rc1_3.10_cuda_12.1"
+ TORCH_INDEX_CPU_URL = "https://download.pytorch.org/whl/test/cpu"
+ TORCH_INDEX_CUDA_URL = "https://download.pytorch.org/whl/test/cu121"
+ TORCH_REPO_BRANCH = "-b release/2.6"
+ TORCH_XLA_REPO_BRANCH = "-b r2.6"


def get_version_mapping(test_version):
"""Get version dependencies based on version type.
@@ -133,6 +146,8 @@ def get_version_mapping(test_version):
version_mapping = VERSION_MAPPING.R2_5
elif test_version == VERSION.R2_5_1:
version_mapping = VERSION_MAPPING.R2_5_1
+ elif test_version == VERSION.R2_6:
+ version_mapping = VERSION_MAPPING.R2_6
else:
raise ValueError("version number does not exist in VERSION enum")
return version_mapping
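The new R2_6 entries are consumed through get_version_mapping. As a usage illustration only (not code from this commit, and with an assumed import path), a caller could resolve the 2.6 release-candidate pins like this:

```python
# Hypothetical caller of the new R2_6 mapping; the import path is an
# assumption based on the file location shown above.
from dags.pytorch_xla.configs import pytorchxla_torchbench_config as config


def r2_6_install_commands() -> list[str]:
    """Build the pip commands implied by the 2.6 release-candidate mapping."""
    mapping = config.get_version_mapping(config.VERSION.R2_6)
    return [
        f"pip install {mapping.TORCH.value} --index-url {mapping.TORCH_INDEX_CPU_URL.value}",
        f"pip install {mapping.TORCH_XLA_TPU_WHEEL.value}",
    ]


if __name__ == "__main__":
    print("\n".join(r2_6_install_commands()))
```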
2 changes: 1 addition & 1 deletion dags/pytorch_xla/pytorchxla-torchbench-release.py
@@ -32,7 +32,7 @@
) as dag:
model = "all" if composer_env.is_prod_env() else "BERT_pytorch"
torchbench_extra_flags = [f"--filter={model}"]
- test_version = config.VERSION.R2_5_1
+ test_version = config.VERSION.R2_6
# Running on V4-8:
config.get_torchbench_tpu_config(
tpu_version=resource.TpuVersion.V4,
54 changes: 26 additions & 28 deletions dags/pytorch_xla/r2_5_1.py → dags/pytorch_xla/r2_6.py
@@ -69,13 +69,13 @@
def torchvision():
mnist_v2_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-mnist-pjrt-func-v2-8-1vm"
+ "pt-2-6-mnist-pjrt-func-v2-8-1vm"
),
US_CENTRAL1_C,
)
resnet_v2_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-resnet50-pjrt-fake-v2-8-1vm",
+ "pt-2-6-resnet50-pjrt-fake-v2-8-1vm",
reserved=True,
),
US_CENTRAL1_C,
@@ -86,8 +86,8 @@ def torchvision():
US_EAST1_D,
)
for test in (
- "pt-2-5-1-resnet50-pjrt-fake-v3-8-1vm",
- "pt-2-5-1-resnet50-pjrt-ddp-fake-v3-8-1vm",
+ "pt-2-6-resnet50-pjrt-fake-v3-8-1vm",
+ "pt-2-6-resnet50-pjrt-ddp-fake-v3-8-1vm",
)
]
resnet_v4_8_tests = [
@@ -96,21 +96,21 @@ def torchvision():
US_CENTRAL2_B,
)
for test in (
- "pt-2-5-1-resnet50-pjrt-fake-v4-8-1vm",
- "pt-2-5-1-resnet50-pjrt-ddp-fake-v4-8-1vm",
- "pt-2-5-1-resnet50-spmd-batch-fake-v4-8-1vm",
- "pt-2-5-1-resnet50-spmd-spatial-fake-v4-8-1vm",
+ "pt-2-6-resnet50-pjrt-fake-v4-8-1vm",
+ "pt-2-6-resnet50-pjrt-ddp-fake-v4-8-1vm",
+ "pt-2-6-resnet50-spmd-batch-fake-v4-8-1vm",
+ "pt-2-6-resnet50-spmd-spatial-fake-v4-8-1vm",
)
]
resnet_v4_32 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-resnet50-pjrt-fake-v4-32-1vm"
+ "pt-2-6-resnet50-pjrt-fake-v4-32-1vm"
),
US_CENTRAL2_B,
)
resnet_v5lp_4 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-resnet50-pjrt-fake-v5litepod-4-1vm",
+ "pt-2-6-resnet50-pjrt-fake-v5litepod-4-1vm",
network=V5_NETWORKS,
subnetwork=V5E_SUBNETWORKS,
reserved=True,
@@ -122,16 +122,14 @@ def torchvision():
resnet_v2_8 >> resnet_v3_8_tests

resnet_v100_2x2 = task.GpuGkeTask(
- test_config.GpuGkeTest.from_pytorch(
- "pt-2-5-1-resnet50-mp-fake-v100-x2x2"
- ),
+ test_config.GpuGkeTest.from_pytorch("pt-2-6-resnet50-mp-fake-v100-x2x2"),
US_CENTRAL1,
"gpu-uc1",
).run()

resnet_v100_2x2_spmd = task.GpuGkeTask(
test_config.GpuGkeTest.from_pytorch(
- "pt-2-5-1-resnet50-spmd-batch-fake-v100-x2x2"
+ "pt-2-6-resnet50-spmd-batch-fake-v100-x2x2"
),
US_CENTRAL1,
"gpu-uc1",
@@ -144,19 +142,19 @@ def torchvision():
def huggingface():
accelerate_v2_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-accelerate-smoke-v2-8-1vm", reserved=True
+ "pt-2-6-accelerate-smoke-v2-8-1vm", reserved=True
),
US_CENTRAL1_C,
)
accelerate_v4_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-accelerate-smoke-v4-8-1vm"
+ "pt-2-6-accelerate-smoke-v4-8-1vm"
),
US_CENTRAL2_B,
)
diffusers_v4_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-hf-diffusers-func-v4-8-1vm"
+ "pt-2-6-hf-diffusers-func-v4-8-1vm"
),
US_CENTRAL2_B,
)
@@ -166,7 +164,7 @@ def huggingface():

task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-hf-bert-pjrt-func-v4-8-1vm"
+ "pt-2-6-hf-bert-pjrt-func-v4-8-1vm"
),
US_CENTRAL2_B,
)
@@ -176,19 +174,19 @@ def llama():
def llama():
llama_inference_v4_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-llama2-infer-func-v4-8-1vm"
+ "pt-2-6-llama2-infer-func-v4-8-1vm"
),
US_CENTRAL2_B,
)
llama_train_v4_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-llama2-train-spmd-func-v4-8-1vm"
+ "pt-2-6-llama2-train-spmd-func-v4-8-1vm"
),
US_CENTRAL2_B,
)
llama_2_inference_v5_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-llama2-infer-func-v5p-8-1vm",
+ "pt-2-6-llama2-infer-func-v5p-8-1vm",
reserved=True,
network=V5_NETWORKS,
subnetwork=V5P_SUBNETWORKS,
@@ -197,7 +195,7 @@ def llama():
)
llama_2_train_v5p_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-llama2-train-spmd-func-v5p-8-1vm",
+ "pt-2-6-llama2-train-spmd-func-v5p-8-1vm",
reserved=True,
network=V5_NETWORKS,
subnetwork=V5P_SUBNETWORKS,
@@ -206,15 +204,15 @@ def llama():
)
llama_3_train_trillium = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-llama3-train-func-v6e-4-1vm",
+ "pt-2-6-llama3-train-func-v6e-4-1vm",
network=V5_NETWORKS,
subnetwork=V6E_SUBNETWORKS,
),
US_CENTRAL2_B_TPU_PROD_ENV,
)
llama_3_train_v5p_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-llama3-train-func-v5p-8-1vm",
+ "pt-2-6-llama3-train-func-v5p-8-1vm",
reserved=True,
network=V5_NETWORKS,
subnetwork=V5P_SUBNETWORKS,
@@ -224,9 +222,9 @@ def llama():


with models.DAG(
- dag_id="pytorchxla-r2-5-1",
+ dag_id="pytorchxla-r2-6",
schedule=SCHEDULED_TIME,
- tags=["pytorchxla", "r2-5-1", "supported", "xlml"],
+ tags=["pytorchxla", "r2-6", "supported", "xlml"],
start_date=datetime.datetime(2023, 7, 12),
catchup=False,
):
@@ -236,7 +234,7 @@ def llama():

resnet_v5lp_4 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-ci-func-v5litepod-4-1vm",
+ "pt-2-6-ci-func-v5litepod-4-1vm",
network=V5_NETWORKS,
subnetwork=V5E_SUBNETWORKS,
reserved=True,
@@ -246,7 +244,7 @@ def llama():

ci_trillium_4 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-ci-func-v6e-4-1vm",
+ "pt-2-6-ci-func-v6e-4-1vm",
network=V5_NETWORKS,
subnetwork=V6E_SUBNETWORKS,
),
