-
Notifications
You must be signed in to change notification settings - Fork 0
/
hf_acc_test_ddp_multi_node.yml
36 lines (35 loc) · 1.35 KB
/
hf_acc_test_ddp_multi_node.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Run configuration for the multi-node Hugging Face Accelerate test
# (the `command` section launches hf_acc_test_multi_node.py via `accelerate launch`).
name: hf-acc-mct-multi-node
image: mosaicml/composer:latest
compute:
  # Number of GPUs to use. Keep in sync with --num_processes in `command`.
  gpus: 32
  # Name of the cluster to run on.
  cluster: r14z3p1
  # Type of GPU to use.
  # NOTE(review): the previous comment said a100_80gb was used in the
  # experiments, but h100_80gb is what is configured here - confirm.
  gpu_type: h100_80gb
integrations:
  # Repository that provides the composer_test/ directory `command` cd's into.
  - integration_type: "git_repo"
    git_repo: SamRaymond/composer_test
    git_branch: main
# Shell script executed for the run. The body must stay indented under the
# `|-` literal block scalar; every line below (including the `#` lines, which
# are shell comments at runtime) is part of the script text.
command: |-
  cd composer_test/
  mkdir -p /tmp/dataset/
  mkdir -p /tmp/dataset/train/
  mkdir -p /tmp/dataset/test/
  chmod -R 777 /tmp/dataset/test/ /tmp/dataset/train/
  pip install -Uq torchvision transformers tqdm accelerate
  # Previous 2-machine / 16-process variant, kept for reference:
  # accelerate launch \
  # --machine_rank $NODE_RANK \
  # --main_process_ip $MASTER_ADDR \
  # --main_process_port $MASTER_PORT \
  # --num_machines 2 \
  # --num_processes 16 \
  # hf_acc_test_multi_node.py || (echo "Command failed - killing python" && pkill python && exit 1)
  # Recorded output of the 2-machine / 16-process variant:
  # Epoch 10/10: 100%|██████████| 7/7 [00:00<00:00, 62.70batch/s, loss=1.69]
  # Epoch 10/10, Average Loss: 1.6640
  #
  # Active launch: 4 machines, 32 processes in total (matches `gpus: 32`).
  # NODE_RANK / MASTER_ADDR / MASTER_PORT are read from the environment and
  # quoted so an empty value cannot shift the argument list.
  # On failure, kill leftover python processes and exit non-zero.
  accelerate launch \
    --machine_rank "$NODE_RANK" \
    --main_process_ip "$MASTER_ADDR" \
    --main_process_port "$MASTER_PORT" \
    --num_machines 4 \
    --num_processes 32 \
    hf_acc_test_multi_node.py || (echo "Command failed - killing python" && pkill python && exit 1)
  # Recorded output of the 4-machine / 32-process launch:
  # Epoch 10/10: 100%|██████████| 2/2 [00:00<00:00, 50.11batch/s, loss=2.2]
  # Epoch 10/10, Average Loss: 2.0521