Update PyTorch/XLA nightly and 2.6rc tests (#505)
tengyifei authored Dec 13, 2024
1 parent 27821e2 commit 1c6bbe4
Showing 14 changed files with 62 additions and 49 deletions.
2 changes: 1 addition & 1 deletion dags/legacy_test/tests/pytorch/nightly/common.libsonnet
@@ -104,7 +104,7 @@ local volumes = import 'templates/volumes.libsonnet';
sudo apt install -y libsndfile-dev
pip3 install --user --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
pip install --user \
- 'torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl' \
+ 'torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev-cp310-cp310-linux_x86_64.whl' \
-f https://storage.googleapis.com/libtpu-releases/index.html \
-f https://storage.googleapis.com/libtpu-wheels/index.html
pip3 install pillow
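The nightly setup above only installs the wheels. As a quick post-install sanity check of the kind these legacy tests rely on, something like the following Python sketch could be run on the TPU VM; it is not part of this commit, and the helper name plus the expected 2.7.0 version prefix (matching the new nightly wheel) are assumptions.

```python
# Hypothetical post-install smoke check; not part of this commit.
# Assumes torch and torch_xla were installed by the nightly setup above.
import torch
import torch_xla
import torch_xla.core.xla_model as xm


def check_nightly_install(expected_prefix: str = "2.7.0") -> None:
    """Verify the torch_xla nightly version and run a tiny op on the XLA device."""
    print("torch:", torch.__version__)
    print("torch_xla:", torch_xla.__version__)
    if not torch_xla.__version__.startswith(expected_prefix):
        raise RuntimeError(
            f"expected a {expected_prefix}* nightly wheel, got {torch_xla.__version__}"
        )
    device = xm.xla_device()  # TPU device when libtpu is present
    result = torch.ones(2, 2, device=device) + 1
    print("device:", device, "result:", result.cpu().tolist())


if __name__ == "__main__":
    check_nightly_install()
```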
@@ -19,14 +19,14 @@ local utils = import 'templates/utils.libsonnet';
local volumes = import 'templates/volumes.libsonnet';

{
- local r2_5_1 = {
- frameworkPrefix: 'pt-2-5-1',
+ local r2_6 = {
+ frameworkPrefix: 'pt-2-6',
tpuSettings+: {
softwareVersion: 'tpu-ubuntu2204-base',
},
- imageTag: 'r2.5.1_3.10',
+ imageTag: 'r2.6.0-rc1_3.10',
},
- PyTorchTest:: common.PyTorchTest + r2_5_1 {
+ PyTorchTest:: common.PyTorchTest + r2_6 {
local config = self,

podTemplate+:: {
@@ -67,7 +67,7 @@ local volumes = import 'templates/volumes.libsonnet';
ctc = cloud_tpu_client.Client(tpu=os.path.basename('$(TPU_NAME)'), zone=os.path.dirname('$(TPU_NAME)'))
ctc.wait_for_healthy()
- ctc.configure_tpu_version(f'pytorch-2.5.1-dev{libtpu_date}', restart_type='always')
+ ctc.configure_tpu_version(f'pytorch-2.6-dev{libtpu_date}', restart_type='always')
ctc.wait_for_healthy()
|||,
],
@@ -102,16 +102,16 @@ local volumes = import 'templates/volumes.libsonnet';
sudo apt install -y libopenblas-base
# for huggingface tests
sudo apt install -y libsndfile-dev
- # Install torchvision by pinned commit in PyTorch 2.5.1 release branch.
- pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/test/cpu
- # torchvision commit reference: https://github.com/pytorch/pytorch/blob/v2.5.1/.github/ci_commit_pins/vision.txt
+ # Install torchvision by pinned commit in PyTorch 2.6 release branch.
+ pip install torch==2.6 --index-url https://download.pytorch.org/whl/test/cpu
+ # torchvision commit reference: https://github.com/pytorch/pytorch/blob/release/2.6/.github/ci_commit_pins/vision.txt
pip install --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@d23a6e1664d20707c11781299611436e1f0c104f"
- pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.5.1-cp310-cp310-manylinux_2_28_x86_64.whl
- pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
+ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0rc1-cp310-cp310-manylinux_2_28_x86_64.whl
+ pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html -f https://storage.googleapis.com/libtpu-wheels/index.html
pip install pillow
git clone --depth=1 https://github.com/pytorch/pytorch.git
cd pytorch
- git clone -b v2.5.1 https://github.com/pytorch/xla.git
+ git clone -b r2.6 https://github.com/pytorch/xla.git
|||,
},
podTemplate+:: {
@@ -147,12 +147,12 @@ local volumes = import 'templates/volumes.libsonnet';
nvidia-smi
pip uninstall -y torch torchvision
- pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/test/cpu
+ pip install torch==2.6 --index-url https://download.pytorch.org/whl/test/cpu
pip install --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@d23a6e1664d20707c11781299611436e1f0c104f"
- pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.5.1-cp310-cp310-manylinux_2_28_x86_64.whl
+ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0rc1-cp310-cp310-manylinux_2_28_x86_64.whl
mkdir -p pytorch/xla
- git clone -b v2.5.1 https://github.com/pytorch/xla.git pytorch/xla
+ git clone -b r2.6 https://github.com/pytorch/xla.git pytorch/xla
%s
@@ -224,5 +224,5 @@ local volumes = import 'templates/volumes.libsonnet';
},

// DEPRECATED: Use PyTorchTpuVmMixin instead
- tpu_vm_r2_5_1_install: self.PyTorchTpuVmMixin.tpuSettings.tpuVmPytorchSetup,
+ tpu_vm_r2_6_install: self.PyTorchTpuVmMixin.tpuSettings.tpuVmPytorchSetup,
}
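For context, the TPU software-version step this file updates (pytorch-2.5.1-dev… to pytorch-2.6-dev…) goes through cloud_tpu_client, as shown in the hunk above. The sketch below restates that step as standalone Python; reading TPU_NAME from the environment and the example libtpu_date are illustrative assumptions, not code from this commit.

```python
# Standalone restatement of the runtime-version step shown in the diff above.
# Assumption: TPU_NAME looks like "<zone>/<node-name>", matching the
# basename/dirname split used in the test template.
import os

import cloud_tpu_client


def configure_tpu_runtime(libtpu_date: str = "20241213") -> None:
    """Point the TPU at the matching pytorch-2.6 dev runtime and wait until healthy."""
    tpu_name = os.environ["TPU_NAME"]  # e.g. "us-central2-b/my-tpu" (assumed)
    ctc = cloud_tpu_client.Client(
        tpu=os.path.basename(tpu_name), zone=os.path.dirname(tpu_name)
    )
    ctc.wait_for_healthy()
    ctc.configure_tpu_version(f"pytorch-2.6-dev{libtpu_date}", restart_type="always")
    ctc.wait_for_healthy()


if __name__ == "__main__":
    configure_tpu_runtime()
```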
4 changes: 2 additions & 2 deletions dags/legacy_test/tests/pytorch/targets.jsonnet
@@ -13,10 +13,10 @@
// limitations under the License.

local nightly = import 'nightly/targets.jsonnet';
- local r2_5_1 = import 'r2.5.1/targets.jsonnet';
+ local r2_6 = import 'r2.6/targets.jsonnet';

// Add new versions here
std.flattenArrays([
nightly,
- r2_5_1,
+ r2_6,
])
19 changes: 17 additions & 2 deletions dags/pytorch_xla/configs/pytorchxla_torchbench_config.py
@@ -32,13 +32,14 @@ class VERSION(enum.Enum):
R2_4 = enum.auto()
R2_5 = enum.auto()
R2_5_1 = enum.auto()
+ R2_6 = enum.auto()


class VERSION_MAPPING:

class NIGHTLY(enum.Enum):
- TORCH_XLA_TPU_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl"
- TORCH_XLA_CUDA_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl"
+ TORCH_XLA_TPU_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev-cp310-cp310-linux_x86_64.whl"
+ TORCH_XLA_CUDA_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.7.0.dev-cp310-cp310-linux_x86_64.whl"
TORCH = "torch"
TORCHVISION = "torchvision"
TORCHAUDIO = "torchaudio"
@@ -109,6 +110,18 @@ class R2_5_1(enum.Enum):
TORCH_REPO_BRANCH = "-b v2.5.1"
TORCH_XLA_REPO_BRANCH = "-b v2.5.1"

+ class R2_6(enum.Enum):
+ TORCH_XLA_TPU_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0rc1-cp310-cp310-manylinux_2_28_x86_64.whl"
+ TORCH_XLA_CUDA_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.6.0rc1-cp310-cp310-linux_x86_64.whl"
+ TORCH = "torch==2.6.0"
+ TORCHVISION = "torchvision==0.20.1"
+ TORCHAUDIO = "torchaudio==2.6.0"
+ TORCH_XLA_GPU_DOCKER = "us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.6.0rc1_3.10_cuda_12.1"
+ TORCH_INDEX_CPU_URL = "https://download.pytorch.org/whl/test/cpu"
+ TORCH_INDEX_CUDA_URL = "https://download.pytorch.org/whl/test/cu121"
+ TORCH_REPO_BRANCH = "-b release/2.6"
+ TORCH_XLA_REPO_BRANCH = "-b r2.6"


def get_version_mapping(test_version):
"""Get version dependencies based on version type.
@@ -133,6 +146,8 @@ def get_version_mapping(test_version):
version_mapping = VERSION_MAPPING.R2_5
elif test_version == VERSION.R2_5_1:
version_mapping = VERSION_MAPPING.R2_5_1
+ elif test_version == VERSION.R2_6:
+ version_mapping = VERSION_MAPPING.R2_6
else:
raise ValueError("version number does not exist in VERSION enum")
return version_mapping
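The new R2_6 entries are consumed through get_version_mapping. As a usage illustration only (not code from this commit, and with an assumed import path), a caller could resolve the 2.6 release-candidate pins like this:

```python
# Hypothetical caller of the new R2_6 mapping; the import path is an
# assumption based on the file location shown above.
from dags.pytorch_xla.configs import pytorchxla_torchbench_config as config


def r2_6_install_commands() -> list[str]:
    """Build the pip commands implied by the 2.6 release-candidate mapping."""
    mapping = config.get_version_mapping(config.VERSION.R2_6)
    return [
        f"pip install {mapping.TORCH.value} --index-url {mapping.TORCH_INDEX_CPU_URL.value}",
        f"pip install {mapping.TORCH_XLA_TPU_WHEEL.value}",
    ]


if __name__ == "__main__":
    print("\n".join(r2_6_install_commands()))
```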
2 changes: 1 addition & 1 deletion dags/pytorch_xla/pytorchxla-torchbench-release.py
@@ -32,7 +32,7 @@
) as dag:
model = "all" if composer_env.is_prod_env() else "BERT_pytorch"
torchbench_extra_flags = [f"--filter={model}"]
- test_version = config.VERSION.R2_5_1
+ test_version = config.VERSION.R2_6
# Running on V4-8:
config.get_torchbench_tpu_config(
tpu_version=resource.TpuVersion.V4,
54 changes: 26 additions & 28 deletions dags/pytorch_xla/r2_5_1.py → dags/pytorch_xla/r2_6.py
@@ -69,13 +69,13 @@
def torchvision():
mnist_v2_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-mnist-pjrt-func-v2-8-1vm"
+ "pt-2-6-mnist-pjrt-func-v2-8-1vm"
),
US_CENTRAL1_C,
)
resnet_v2_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-resnet50-pjrt-fake-v2-8-1vm",
+ "pt-2-6-resnet50-pjrt-fake-v2-8-1vm",
reserved=True,
),
US_CENTRAL1_C,
@@ -86,8 +86,8 @@ def torchvision():
US_EAST1_D,
)
for test in (
- "pt-2-5-1-resnet50-pjrt-fake-v3-8-1vm",
- "pt-2-5-1-resnet50-pjrt-ddp-fake-v3-8-1vm",
+ "pt-2-6-resnet50-pjrt-fake-v3-8-1vm",
+ "pt-2-6-resnet50-pjrt-ddp-fake-v3-8-1vm",
)
]
resnet_v4_8_tests = [
@@ -96,21 +96,21 @@ def torchvision():
US_CENTRAL2_B,
)
for test in (
- "pt-2-5-1-resnet50-pjrt-fake-v4-8-1vm",
- "pt-2-5-1-resnet50-pjrt-ddp-fake-v4-8-1vm",
- "pt-2-5-1-resnet50-spmd-batch-fake-v4-8-1vm",
- "pt-2-5-1-resnet50-spmd-spatial-fake-v4-8-1vm",
+ "pt-2-6-resnet50-pjrt-fake-v4-8-1vm",
+ "pt-2-6-resnet50-pjrt-ddp-fake-v4-8-1vm",
+ "pt-2-6-resnet50-spmd-batch-fake-v4-8-1vm",
+ "pt-2-6-resnet50-spmd-spatial-fake-v4-8-1vm",
)
]
resnet_v4_32 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-resnet50-pjrt-fake-v4-32-1vm"
+ "pt-2-6-resnet50-pjrt-fake-v4-32-1vm"
),
US_CENTRAL2_B,
)
resnet_v5lp_4 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-resnet50-pjrt-fake-v5litepod-4-1vm",
+ "pt-2-6-resnet50-pjrt-fake-v5litepod-4-1vm",
network=V5_NETWORKS,
subnetwork=V5E_SUBNETWORKS,
reserved=True,
@@ -122,16 +122,14 @@ def torchvision():
resnet_v2_8 >> resnet_v3_8_tests

resnet_v100_2x2 = task.GpuGkeTask(
- test_config.GpuGkeTest.from_pytorch(
- "pt-2-5-1-resnet50-mp-fake-v100-x2x2"
- ),
+ test_config.GpuGkeTest.from_pytorch("pt-2-6-resnet50-mp-fake-v100-x2x2"),
US_CENTRAL1,
"gpu-uc1",
).run()

resnet_v100_2x2_spmd = task.GpuGkeTask(
test_config.GpuGkeTest.from_pytorch(
- "pt-2-5-1-resnet50-spmd-batch-fake-v100-x2x2"
+ "pt-2-6-resnet50-spmd-batch-fake-v100-x2x2"
),
US_CENTRAL1,
"gpu-uc1",
@@ -144,19 +142,19 @@ def torchvision():
def huggingface():
accelerate_v2_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-accelerate-smoke-v2-8-1vm", reserved=True
+ "pt-2-6-accelerate-smoke-v2-8-1vm", reserved=True
),
US_CENTRAL1_C,
)
accelerate_v4_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-accelerate-smoke-v4-8-1vm"
+ "pt-2-6-accelerate-smoke-v4-8-1vm"
),
US_CENTRAL2_B,
)
diffusers_v4_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-hf-diffusers-func-v4-8-1vm"
+ "pt-2-6-hf-diffusers-func-v4-8-1vm"
),
US_CENTRAL2_B,
)
@@ -166,7 +164,7 @@ def huggingface():

task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-hf-bert-pjrt-func-v4-8-1vm"
+ "pt-2-6-hf-bert-pjrt-func-v4-8-1vm"
),
US_CENTRAL2_B,
)
@@ -176,19 +174,19 @@ def llama():
def llama():
llama_inference_v4_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-llama2-infer-func-v4-8-1vm"
+ "pt-2-6-llama2-infer-func-v4-8-1vm"
),
US_CENTRAL2_B,
)
llama_train_v4_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-llama2-train-spmd-func-v4-8-1vm"
+ "pt-2-6-llama2-train-spmd-func-v4-8-1vm"
),
US_CENTRAL2_B,
)
llama_2_inference_v5_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-llama2-infer-func-v5p-8-1vm",
+ "pt-2-6-llama2-infer-func-v5p-8-1vm",
reserved=True,
network=V5_NETWORKS,
subnetwork=V5P_SUBNETWORKS,
@@ -197,7 +195,7 @@ def llama():
)
llama_2_train_v5p_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-llama2-train-spmd-func-v5p-8-1vm",
+ "pt-2-6-llama2-train-spmd-func-v5p-8-1vm",
reserved=True,
network=V5_NETWORKS,
subnetwork=V5P_SUBNETWORKS,
@@ -206,15 +204,15 @@ def llama():
)
llama_3_train_trillium = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-llama3-train-func-v6e-4-1vm",
+ "pt-2-6-llama3-train-func-v6e-4-1vm",
network=V5_NETWORKS,
subnetwork=V6E_SUBNETWORKS,
),
US_CENTRAL2_B_TPU_PROD_ENV,
)
llama_3_train_v5p_8 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-llama3-train-func-v5p-8-1vm",
+ "pt-2-6-llama3-train-func-v5p-8-1vm",
reserved=True,
network=V5_NETWORKS,
subnetwork=V5P_SUBNETWORKS,
@@ -224,9 +222,9 @@ def llama():


with models.DAG(
- dag_id="pytorchxla-r2-5-1",
+ dag_id="pytorchxla-r2-6",
schedule=SCHEDULED_TIME,
- tags=["pytorchxla", "r2-5-1", "supported", "xlml"],
+ tags=["pytorchxla", "r2-6", "supported", "xlml"],
start_date=datetime.datetime(2023, 7, 12),
catchup=False,
):
@@ -236,7 +234,7 @@ def llama():

resnet_v5lp_4 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-ci-func-v5litepod-4-1vm",
+ "pt-2-6-ci-func-v5litepod-4-1vm",
network=V5_NETWORKS,
subnetwork=V5E_SUBNETWORKS,
reserved=True,
@@ -246,7 +244,7 @@ def llama():

ci_trillium_4 = task.run_queued_resource_test(
test_config.JSonnetTpuVmTest.from_pytorch(
- "pt-2-5-1-ci-func-v6e-4-1vm",
+ "pt-2-6-ci-func-v6e-4-1vm",
network=V5_NETWORKS,
subnetwork=V6E_SUBNETWORKS,
),
