train_mono.sh
#!/usr/bin/env bash
# env variables for DDP training (defaults target a single-node run)
[ -z "${MASTER_PORT}" ] && MASTER_PORT=12346
[ -z "${MASTER_ADDR}" ] && MASTER_ADDR=127.0.0.1
[ -z "${OMPI_COMM_WORLD_SIZE}" ] && OMPI_COMM_WORLD_SIZE=1
[ -z "${OMPI_COMM_WORLD_RANK}" ] && OMPI_COMM_WORLD_RANK=0
[ -z "${GPUS}" ] && GPUS=$(nvidia-smi -L | wc -l)
# OMPI_COMM_WORLD_SIZE is always set by this point, so branch only on its value
if (( OMPI_COMM_WORLD_SIZE == 1 ))
then
    # single node: use every local GPU
    DISTRIBUTED_ARGS="--nproc_per_node $GPUS \
                      --master_port $MASTER_PORT"
else
    # multi node: node count and rank come from Open MPI;
    # pass --master_port here too so all nodes rendezvous on the same port
    DISTRIBUTED_ARGS="--nproc_per_node $GPUS \
                      --nnodes $OMPI_COMM_WORLD_SIZE \
                      --node_rank $OMPI_COMM_WORLD_RANK \
                      --master_addr $MASTER_ADDR \
                      --master_port $MASTER_PORT"
fi
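# For reference, on e.g. a 4-GPU single node (illustrative values only),
# DISTRIBUTED_ARGS resolves to:
#   --nproc_per_node 4 --master_port 12346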
# training args (each can be overridden by exporting the variable beforehand)
[ -z "${enc_layers}" ] && enc_layers=3     # transformer encoder layers
[ -z "${dec_layers}" ] && dec_layers=3     # transformer decoder layers
[ -z "${heads}" ] && heads=4               # attention heads
[ -z "${max_labels}" ] && max_labels=1     # passed to --num_queries
[ -z "${lr}" ] && lr=1e-4                  # learning rate
[ -z "${bs}" ] && bs=8                     # batch size
[ -z "${eos_coef}" ] && eos_coef=0         # eos / no-object class coefficient
[ -z "${epoch}" ] && epoch=30              # number of training epochs
[ -z "${esm_layer}" ] && esm_layer=32      # ESM layer to take embeddings from
[ -z "${hidden_dim}" ] && hidden_dim=256   # model hidden dimension
# replace the placeholder with a descriptive run name before launching
model_name="your_model_name"
torchrun $DISTRIBUTED_ARGS train_mono.py \
--model_name "${model_name}" \
--lr "${lr}" \
--batch_size "${bs}" \
--num_queries "${max_labels}" \
--enc_layers "${enc_layers}" \
--dec_layers "${dec_layers}" \
--nheads "${heads}" \
--eos_coef "${eos_coef}" \
--epochs "${epoch}" \
--esm_layer "${esm_layer}" \
--hidden_dim "${hidden_dim}"
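# Example invocations (a sketch; hostnames and values are placeholders):
#   bash train_mono.sh                          # single node, all local GPUs
#   lr=5e-5 bs=16 epoch=50 bash train_mono.sh   # override hyperparameters inline
# Multi-node via Open MPI (MASTER_ADDR must point at the rank-0 host):
#   mpirun -np 2 -H node0,node1 -x MASTER_ADDR=node0 bash train_mono.sh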