diff --git a/dags/legacy_test/tests/pytorch/nightly/common.libsonnet b/dags/legacy_test/tests/pytorch/nightly/common.libsonnet index 8bbe599b..dd770ec6 100644 --- a/dags/legacy_test/tests/pytorch/nightly/common.libsonnet +++ b/dags/legacy_test/tests/pytorch/nightly/common.libsonnet @@ -104,7 +104,7 @@ local volumes = import 'templates/volumes.libsonnet'; sudo apt install -y libsndfile-dev pip3 install --user --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu pip install --user \ - 'torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl' \ + 'torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev-cp310-cp310-linux_x86_64.whl' \ -f https://storage.googleapis.com/libtpu-releases/index.html \ -f https://storage.googleapis.com/libtpu-wheels/index.html pip3 install pillow diff --git a/dags/legacy_test/tests/pytorch/r2.5.1/accelerate-smoke.libsonnet b/dags/legacy_test/tests/pytorch/r2.6/accelerate-smoke.libsonnet similarity index 100% rename from dags/legacy_test/tests/pytorch/r2.5.1/accelerate-smoke.libsonnet rename to dags/legacy_test/tests/pytorch/r2.6/accelerate-smoke.libsonnet diff --git a/dags/legacy_test/tests/pytorch/r2.5.1/ci.libsonnet b/dags/legacy_test/tests/pytorch/r2.6/ci.libsonnet similarity index 100% rename from dags/legacy_test/tests/pytorch/r2.5.1/ci.libsonnet rename to dags/legacy_test/tests/pytorch/r2.6/ci.libsonnet diff --git a/dags/legacy_test/tests/pytorch/r2.5.1/common.libsonnet b/dags/legacy_test/tests/pytorch/r2.6/common.libsonnet similarity index 87% rename from dags/legacy_test/tests/pytorch/r2.5.1/common.libsonnet rename to dags/legacy_test/tests/pytorch/r2.6/common.libsonnet index 524cc733..84791bff 100644 --- a/dags/legacy_test/tests/pytorch/r2.5.1/common.libsonnet +++ b/dags/legacy_test/tests/pytorch/r2.6/common.libsonnet @@ -19,14 +19,14 @@ local utils = import 'templates/utils.libsonnet'; 
local volumes = import 'templates/volumes.libsonnet'; { - local r2_5_1 = { - frameworkPrefix: 'pt-2-5-1', + local r2_6 = { + frameworkPrefix: 'pt-2-6', tpuSettings+: { softwareVersion: 'tpu-ubuntu2204-base', }, - imageTag: 'r2.5.1_3.10', + imageTag: 'r2.6.0-rc1_3.10', }, - PyTorchTest:: common.PyTorchTest + r2_5_1 { + PyTorchTest:: common.PyTorchTest + r2_6 { local config = self, podTemplate+:: { @@ -67,7 +67,7 @@ local volumes = import 'templates/volumes.libsonnet'; ctc = cloud_tpu_client.Client(tpu=os.path.basename('$(TPU_NAME)'), zone=os.path.dirname('$(TPU_NAME)')) ctc.wait_for_healthy() - ctc.configure_tpu_version(f'pytorch-2.5.1-dev{libtpu_date}', restart_type='always') + ctc.configure_tpu_version(f'pytorch-2.6-dev{libtpu_date}', restart_type='always') ctc.wait_for_healthy() |||, ], @@ -102,16 +102,16 @@ local volumes = import 'templates/volumes.libsonnet'; sudo apt install -y libopenblas-base # for huggingface tests sudo apt install -y libsndfile-dev - # Install torchvision by pinned commit in PyTorch 2.5.1 release branch. - pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/test/cpu - # torchvision commit reference: https://github.com/pytorch/pytorch/blob/v2.5.1/.github/ci_commit_pins/vision.txt + # Install torchvision by pinned commit in PyTorch 2.6 release branch. 
+ pip install torch==2.6 --index-url https://download.pytorch.org/whl/test/cpu + # torchvision commit reference: https://github.com/pytorch/pytorch/blob/release/2.6/.github/ci_commit_pins/vision.txt pip install --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@d23a6e1664d20707c11781299611436e1f0c104f" - pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.5.1-cp310-cp310-manylinux_2_28_x86_64.whl - pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html + pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0rc1-cp310-cp310-manylinux_2_28_x86_64.whl + pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html -f https://storage.googleapis.com/libtpu-wheels/index.html pip install pillow git clone --depth=1 https://github.com/pytorch/pytorch.git cd pytorch - git clone -b v2.5.1 https://github.com/pytorch/xla.git + git clone -b r2.6 https://github.com/pytorch/xla.git |||, }, podTemplate+:: { @@ -147,12 +147,12 @@ local volumes = import 'templates/volumes.libsonnet'; nvidia-smi pip uninstall -y torch torchvision - pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/test/cpu + pip install torch==2.6 --index-url https://download.pytorch.org/whl/test/cpu pip install --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@d23a6e1664d20707c11781299611436e1f0c104f" - pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.5.1-cp310-cp310-manylinux_2_28_x86_64.whl + pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0rc1-cp310-cp310-manylinux_2_28_x86_64.whl mkdir -p pytorch/xla - git clone -b v2.5.1 https://github.com/pytorch/xla.git pytorch/xla + git clone -b r2.6 https://github.com/pytorch/xla.git pytorch/xla %s @@ -224,5 +224,5 @@ local volumes = import 'templates/volumes.libsonnet'; }, // DEPRECATED: Use PyTorchTpuVmMixin 
instead - tpu_vm_r2_5_1_install: self.PyTorchTpuVmMixin.tpuSettings.tpuVmPytorchSetup, + tpu_vm_r2_6_install: self.PyTorchTpuVmMixin.tpuSettings.tpuVmPytorchSetup, } diff --git a/dags/legacy_test/tests/pytorch/r2.5.1/hf-bert.libsonnet b/dags/legacy_test/tests/pytorch/r2.6/hf-bert.libsonnet similarity index 100% rename from dags/legacy_test/tests/pytorch/r2.5.1/hf-bert.libsonnet rename to dags/legacy_test/tests/pytorch/r2.6/hf-bert.libsonnet diff --git a/dags/legacy_test/tests/pytorch/r2.5.1/hf-diffusers.libsonnet b/dags/legacy_test/tests/pytorch/r2.6/hf-diffusers.libsonnet similarity index 100% rename from dags/legacy_test/tests/pytorch/r2.5.1/hf-diffusers.libsonnet rename to dags/legacy_test/tests/pytorch/r2.6/hf-diffusers.libsonnet diff --git a/dags/legacy_test/tests/pytorch/r2.5.1/llama2-model.libsonnet b/dags/legacy_test/tests/pytorch/r2.6/llama2-model.libsonnet similarity index 100% rename from dags/legacy_test/tests/pytorch/r2.5.1/llama2-model.libsonnet rename to dags/legacy_test/tests/pytorch/r2.6/llama2-model.libsonnet diff --git a/dags/legacy_test/tests/pytorch/r2.5.1/mnist.libsonnet b/dags/legacy_test/tests/pytorch/r2.6/mnist.libsonnet similarity index 100% rename from dags/legacy_test/tests/pytorch/r2.5.1/mnist.libsonnet rename to dags/legacy_test/tests/pytorch/r2.6/mnist.libsonnet diff --git a/dags/legacy_test/tests/pytorch/r2.5.1/resnet50-mp.libsonnet b/dags/legacy_test/tests/pytorch/r2.6/resnet50-mp.libsonnet similarity index 100% rename from dags/legacy_test/tests/pytorch/r2.5.1/resnet50-mp.libsonnet rename to dags/legacy_test/tests/pytorch/r2.6/resnet50-mp.libsonnet diff --git a/dags/legacy_test/tests/pytorch/r2.5.1/targets.jsonnet b/dags/legacy_test/tests/pytorch/r2.6/targets.jsonnet similarity index 100% rename from dags/legacy_test/tests/pytorch/r2.5.1/targets.jsonnet rename to dags/legacy_test/tests/pytorch/r2.6/targets.jsonnet diff --git a/dags/legacy_test/tests/pytorch/targets.jsonnet b/dags/legacy_test/tests/pytorch/targets.jsonnet index 
36c21372..f543b25b 100644 --- a/dags/legacy_test/tests/pytorch/targets.jsonnet +++ b/dags/legacy_test/tests/pytorch/targets.jsonnet @@ -13,10 +13,10 @@ // limitations under the License. local nightly = import 'nightly/targets.jsonnet'; -local r2_5_1 = import 'r2.5.1/targets.jsonnet'; +local r2_6 = import 'r2.6/targets.jsonnet'; // Add new versions here std.flattenArrays([ nightly, - r2_5_1, + r2_6, ]) diff --git a/dags/pytorch_xla/configs/pytorchxla_torchbench_config.py b/dags/pytorch_xla/configs/pytorchxla_torchbench_config.py index d490456a..475122c0 100644 --- a/dags/pytorch_xla/configs/pytorchxla_torchbench_config.py +++ b/dags/pytorch_xla/configs/pytorchxla_torchbench_config.py @@ -32,13 +32,14 @@ class VERSION(enum.Enum): R2_4 = enum.auto() R2_5 = enum.auto() R2_5_1 = enum.auto() + R2_6 = enum.auto() class VERSION_MAPPING: class NIGHTLY(enum.Enum): - TORCH_XLA_TPU_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl" - TORCH_XLA_CUDA_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl" + TORCH_XLA_TPU_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev-cp310-cp310-linux_x86_64.whl" + TORCH_XLA_CUDA_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.7.0.dev-cp310-cp310-linux_x86_64.whl" TORCH = "torch" TORCHVISION = "torchvision" TORCHAUDIO = "torchaudio" @@ -109,6 +110,18 @@ class R2_5_1(enum.Enum): TORCH_REPO_BRANCH = "-b v2.5.1" TORCH_XLA_REPO_BRANCH = "-b v2.5.1" + class R2_6(enum.Enum): + TORCH_XLA_TPU_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0rc1-cp310-cp310-manylinux_2_28_x86_64.whl" + TORCH_XLA_CUDA_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.6.0rc1-cp310-cp310-linux_x86_64.whl" + TORCH = "torch==2.6.0" + TORCHVISION = 
"torchvision==0.21.0" + TORCHAUDIO = "torchaudio==2.6.0" + TORCH_XLA_GPU_DOCKER = "us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.6.0rc1_3.10_cuda_12.1" + TORCH_INDEX_CPU_URL = "https://download.pytorch.org/whl/test/cpu" + TORCH_INDEX_CUDA_URL = "https://download.pytorch.org/whl/test/cu121" + TORCH_REPO_BRANCH = "-b release/2.6" + TORCH_XLA_REPO_BRANCH = "-b r2.6" + def get_version_mapping(test_version): """Get version dependecies based on version type. @@ -133,6 +146,8 @@ def get_version_mapping(test_version): version_mapping = VERSION_MAPPING.R2_5 elif test_version == VERSION.R2_5_1: version_mapping = VERSION_MAPPING.R2_5_1 + elif test_version == VERSION.R2_6: + version_mapping = VERSION_MAPPING.R2_6 else: raise ValueError("version number does not exist in VERSION enum") return version_mapping diff --git a/dags/pytorch_xla/pytorchxla-torchbench-release.py b/dags/pytorch_xla/pytorchxla-torchbench-release.py index b1ad0b6a..722a4d56 100644 --- a/dags/pytorch_xla/pytorchxla-torchbench-release.py +++ b/dags/pytorch_xla/pytorchxla-torchbench-release.py @@ -32,7 +32,7 @@ ) as dag: model = "all" if composer_env.is_prod_env() else "BERT_pytorch" torchbench_extra_flags = [f"--filter={model}"] - test_version = config.VERSION.R2_5_1 + test_version = config.VERSION.R2_6 # Running on V4-8: config.get_torchbench_tpu_config( tpu_version=resource.TpuVersion.V4, diff --git a/dags/pytorch_xla/r2_5_1.py b/dags/pytorch_xla/r2_6.py similarity index 81% rename from dags/pytorch_xla/r2_5_1.py rename to dags/pytorch_xla/r2_6.py index ed54bad0..3dcb54ed 100644 --- a/dags/pytorch_xla/r2_5_1.py +++ b/dags/pytorch_xla/r2_6.py @@ -69,13 +69,13 @@ def torchvision(): mnist_v2_8 = task.run_queued_resource_test( test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-5-1-mnist-pjrt-func-v2-8-1vm" + "pt-2-6-mnist-pjrt-func-v2-8-1vm" ), US_CENTRAL1_C, ) resnet_v2_8 = task.run_queued_resource_test( test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-5-1-resnet50-pjrt-fake-v2-8-1vm", + 
"pt-2-6-resnet50-pjrt-fake-v2-8-1vm", reserved=True, ), US_CENTRAL1_C, @@ -86,8 +86,8 @@ def torchvision(): US_EAST1_D, ) for test in ( - "pt-2-5-1-resnet50-pjrt-fake-v3-8-1vm", - "pt-2-5-1-resnet50-pjrt-ddp-fake-v3-8-1vm", + "pt-2-6-resnet50-pjrt-fake-v3-8-1vm", + "pt-2-6-resnet50-pjrt-ddp-fake-v3-8-1vm", ) ] resnet_v4_8_tests = [ @@ -96,21 +96,21 @@ def torchvision(): US_CENTRAL2_B, ) for test in ( - "pt-2-5-1-resnet50-pjrt-fake-v4-8-1vm", - "pt-2-5-1-resnet50-pjrt-ddp-fake-v4-8-1vm", - "pt-2-5-1-resnet50-spmd-batch-fake-v4-8-1vm", - "pt-2-5-1-resnet50-spmd-spatial-fake-v4-8-1vm", + "pt-2-6-resnet50-pjrt-fake-v4-8-1vm", + "pt-2-6-resnet50-pjrt-ddp-fake-v4-8-1vm", + "pt-2-6-resnet50-spmd-batch-fake-v4-8-1vm", + "pt-2-6-resnet50-spmd-spatial-fake-v4-8-1vm", ) ] resnet_v4_32 = task.run_queued_resource_test( test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-5-1-resnet50-pjrt-fake-v4-32-1vm" + "pt-2-6-resnet50-pjrt-fake-v4-32-1vm" ), US_CENTRAL2_B, ) resnet_v5lp_4 = task.run_queued_resource_test( test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-5-1-resnet50-pjrt-fake-v5litepod-4-1vm", + "pt-2-6-resnet50-pjrt-fake-v5litepod-4-1vm", network=V5_NETWORKS, subnetwork=V5E_SUBNETWORKS, reserved=True, @@ -122,16 +122,14 @@ def torchvision(): resnet_v2_8 >> resnet_v3_8_tests resnet_v100_2x2 = task.GpuGkeTask( - test_config.GpuGkeTest.from_pytorch( - "pt-2-5-1-resnet50-mp-fake-v100-x2x2" - ), + test_config.GpuGkeTest.from_pytorch("pt-2-6-resnet50-mp-fake-v100-x2x2"), US_CENTRAL1, "gpu-uc1", ).run() resnet_v100_2x2_spmd = task.GpuGkeTask( test_config.GpuGkeTest.from_pytorch( - "pt-2-5-1-resnet50-spmd-batch-fake-v100-x2x2" + "pt-2-6-resnet50-spmd-batch-fake-v100-x2x2" ), US_CENTRAL1, "gpu-uc1", @@ -144,19 +142,19 @@ def torchvision(): def huggingface(): accelerate_v2_8 = task.run_queued_resource_test( test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-5-1-accelerate-smoke-v2-8-1vm", reserved=True + "pt-2-6-accelerate-smoke-v2-8-1vm", reserved=True ), US_CENTRAL1_C, ) 
accelerate_v4_8 = task.run_queued_resource_test( test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-5-1-accelerate-smoke-v4-8-1vm" + "pt-2-6-accelerate-smoke-v4-8-1vm" ), US_CENTRAL2_B, ) diffusers_v4_8 = task.run_queued_resource_test( test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-5-1-hf-diffusers-func-v4-8-1vm" + "pt-2-6-hf-diffusers-func-v4-8-1vm" ), US_CENTRAL2_B, ) @@ -166,7 +164,7 @@ def huggingface(): task.run_queued_resource_test( test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-5-1-hf-bert-pjrt-func-v4-8-1vm" + "pt-2-6-hf-bert-pjrt-func-v4-8-1vm" ), US_CENTRAL2_B, ) @@ -176,19 +174,19 @@ def huggingface(): def llama(): llama_inference_v4_8 = task.run_queued_resource_test( test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-5-1-llama2-infer-func-v4-8-1vm" + "pt-2-6-llama2-infer-func-v4-8-1vm" ), US_CENTRAL2_B, ) llama_train_v4_8 = task.run_queued_resource_test( test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-5-1-llama2-train-spmd-func-v4-8-1vm" + "pt-2-6-llama2-train-spmd-func-v4-8-1vm" ), US_CENTRAL2_B, ) llama_2_inference_v5_8 = task.run_queued_resource_test( test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-5-1-llama2-infer-func-v5p-8-1vm", + "pt-2-6-llama2-infer-func-v5p-8-1vm", reserved=True, network=V5_NETWORKS, subnetwork=V5P_SUBNETWORKS, @@ -197,7 +195,7 @@ def llama(): ) llama_2_train_v5p_8 = task.run_queued_resource_test( test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-5-1-llama2-train-spmd-func-v5p-8-1vm", + "pt-2-6-llama2-train-spmd-func-v5p-8-1vm", reserved=True, network=V5_NETWORKS, subnetwork=V5P_SUBNETWORKS, @@ -206,7 +204,7 @@ def llama(): ) llama_3_train_trillium = task.run_queued_resource_test( test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-5-1-llama3-train-func-v6e-4-1vm", + "pt-2-6-llama3-train-func-v6e-4-1vm", network=V5_NETWORKS, subnetwork=V6E_SUBNETWORKS, ), @@ -214,7 +212,7 @@ def llama(): ) llama_3_train_v5p_8 = task.run_queued_resource_test( test_config.JSonnetTpuVmTest.from_pytorch( - 
"pt-2-5-1-llama3-train-func-v5p-8-1vm", + "pt-2-6-llama3-train-func-v5p-8-1vm", reserved=True, network=V5_NETWORKS, subnetwork=V5P_SUBNETWORKS, @@ -224,9 +222,9 @@ def llama(): with models.DAG( - dag_id="pytorchxla-r2-5-1", + dag_id="pytorchxla-r2-6", schedule=SCHEDULED_TIME, - tags=["pytorchxla", "r2-5-1", "supported", "xlml"], + tags=["pytorchxla", "r2-6", "supported", "xlml"], start_date=datetime.datetime(2023, 7, 12), catchup=False, ): @@ -236,7 +234,7 @@ def llama(): resnet_v5lp_4 = task.run_queued_resource_test( test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-5-1-ci-func-v5litepod-4-1vm", + "pt-2-6-ci-func-v5litepod-4-1vm", network=V5_NETWORKS, subnetwork=V5E_SUBNETWORKS, reserved=True, @@ -246,7 +244,7 @@ def llama(): ci_trillium_4 = task.run_queued_resource_test( test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-5-1-ci-func-v6e-4-1vm", + "pt-2-6-ci-func-v6e-4-1vm", network=V5_NETWORKS, subnetwork=V6E_SUBNETWORKS, ),