Skip to content

Commit

Permalink
add tune script
Browse files: browse the repository at this point in the history
  • Loading branch information
king-menin committed Feb 11, 2021
1 parent 6092b0d commit 1a10d61
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 10 deletions.
15 changes: 8 additions & 7 deletions examples/finetune_RuGPT3Small.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
#! /bin/bash

# Model parallel size
MP_SIZE=1
# Change for multinode config
NUM_GPUS_PER_WORKER=1

@@ -13,8 +10,8 @@ gpt_options=" \
--train-data-path /home/jovyan/data/gpt3test/essays/train.list \
--test-data-path /home/jovyan/data/gpt3test/essays/valid.list \
--max-files-per-process 100 \
--logging-dir=/home/jovyan/models/essays/log2 \
--save /home/jovyan/models/essays/model2 \
--logging-dir=/home/jovyan/models/essays/log3 \
--save /home/jovyan/models/essays/model3 \
--load-huggingface sberbank-ai/rugpt3small_based_on_gpt2
--save-interval 1000 \
--no-load-optim \
@@ -33,10 +30,14 @@ gpt_options=" \
--warmup 0.0 \
--lr-decay-style constant \
--weight-decay 1e-2 \
--fp16
--fp16 \
--checkpoint-activations \
--deepspeed-activation-checkpointing \
--deepspeed \
--deepspeed_config /home/jovyan/devices/ru-gpts/src/deepspeed_config/gpt3_small_2048.json \
"

run_cmd="CUDA_VISIBLE_DEVICES=1 python -m torch.distributed.launch --nproc_per_node $NUM_GPUS_PER_WORKER /home/jovyan/devices/ru-gpts/pretrain_gpt3.py $@ ${gpt_options}"
run_cmd="USE_DEEPSPEED=1 python -m torch.distributed.launch --nproc_per_node $NUM_GPUS_PER_WORKER /home/jovyan/devices/ru-gpts/pretrain_gpt3.py $@ ${gpt_options}"
echo ${run_cmd}
eval ${run_cmd}

Expand Down
6 changes: 3 additions & 3 deletions src/utils.py
Original file line number Diff line number Diff line change
@@ -33,8 +33,8 @@ def __init__(self):
self.use_ds = os.environ.get("USE_DEEPSPEED", False)
self.deepspeed = None
if self.use_ds:
import functools
self.deepspeed = functools
import deepspeed
self.deepspeed = deepspeed

def __bool__(self):
return bool(self.use_ds)
@@ -255,7 +255,7 @@ def save_ds_checkpoint(iteration, model, args):
sd['cuda_rng_state'] = torch.cuda.get_rng_state()
sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker().get_states()

model.save_checkpoint(args.save, iteration, client_state=sd)
model.save_checkpoint(args.save, str(iteration), client_state=sd)


def get_checkpoints(load_dir):
Expand Down

0 comments on commit 1a10d61

Please sign in to comment.