Skip to content

Commit

Permalink
Create run_benchmark
Browse files Browse the repository at this point in the history
  • Loading branch information
ManfeiBai authored Nov 13, 2023
1 parent 1b905cc commit e742006
Showing 1 changed file with 79 additions and 0 deletions.
79 changes: 79 additions & 0 deletions benchmarks/run_benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/bin/bash
set -exo pipefail
CDIR="$(cd "$(dirname "$0")" ; pwd -P)"
LOGFILE=/tmp/benchmark_test.log

# Note [Keep Going]
#
# Set the `CONTINUE_ON_ERROR` flag to `true` to make the CircleCI tests continue on error.
# This will allow you to see all the failures on your PR, not stopping with the first
# test failure like the default behavior.
CONTINUE_ON_ERROR="${CONTINUE_ON_ERROR:-0}"
if [[ "$CONTINUE_ON_ERROR" == "1" ]]; then
set +e
fi

TESTGPUVM=None
TESTTPUVM=None
# NUMBER=0

while getopts 'G:T:' OPTION # N:
do
case $OPTION in
G)
TESTGPUVM=$OPTARG
;;
T)
TESTTPUVM=$OPTARG
;;
# N)
# NUMBER=$OPTARG
# ;;
esac
done
shift $(($OPTIND - 1))

# func for test after ssh to VM, create container and execute in container
function benchmarking_in_container {
sudo docker pull gcr.io/tpu-pytorch/xla:nightly_3.8_cuda_11.8
sudo apt-get install -y apt-transport-https ca-certificates curl gnupg-agent software-properties-common
nvidia-smi
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
sudo systemctl restart docker
sudo docker run --gpus all -it -d gcr.io/tpu-pytorch/xla:nightly_3.8_cuda_11.8 bin/bash
sudo docker exec -it $(sudo docker ps | awk 'NR==2 { print $1 }') /bin/bash
# install torchbench
cd ~
git clone -b xla_benchmark https://github.com/pytorch/benchmark.git
cd benchmark
# install deps
pip install --pre torchvision torchaudio -i https://download.pytorch.org/whl/nightly/cu118
# git clone xla
cd ~
git clone -b benchmark https://github.com/pytorch/xla.git xla
cd ~/xla/benchmarks
# dry run
python3 experiment_runner.py --suite-name=torchbench --accelerator=gpu --progress-bar --dry-run
# run bechmark
python3 experiment_runner.py --suite-name=torchbench --accelerator=gpu --progress-bar
# analyze result to csv
python3 result_analyzer.py
}



if TESTGPUVM='1A100':
# ssh to 1-A100 GPUVM and test in container
gcloud compute ssh a100-manfei-1 --zone us-central1-c --project tpu-prod-env-one-vm -- -o ProxyCommand='corp-ssh-helper %h %p' --command=benchmarking_in_container
elif TESTGPUVM='8A100':
# SSH TO 8-A100 GPUVM and test in container
gcloud compute ssh manfei-a100-8-new --zone us-central1-c --project tpu-prod-env-one-vm -- -o ProxyCommand='corp-ssh-helper %h %p' --command=benchmarking_in_container
elif TESTGPUVM='4H100':
# ssh to 4-H100 GPUVM and test in container
elif TESTTPUVM='v5e8':
# ssh to v5e-8 TPUVM and test in container
elif TESTTPUVM='v5p8':
# ssh to v5p-8 TPUVM and test in container

0 comments on commit e742006

Please sign in to comment.