add tune script

ai-forever · Feb 11, 2021 · 1a10d61
1 parent 6092b0d
commit 1a10d61
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 10 deletions.
diff --git a/examples/finetune_RuGPT3Small.sh b/examples/finetune_RuGPT3Small.sh
@@ -1,7 +1,4 @@
 #! /bin/bash
-
-# Model parallel size
-MP_SIZE=1
 # Change for multinode config
 NUM_GPUS_PER_WORKER=1
 
@@ -13,8 +10,8 @@ gpt_options=" \
        --train-data-path /home/jovyan/data/gpt3test/essays/train.list \
        --test-data-path /home/jovyan/data/gpt3test/essays/valid.list \
        --max-files-per-process 100 \
-       --logging-dir=/home/jovyan/models/essays/log2 \
-       --save /home/jovyan/models/essays/model2 \
+       --logging-dir=/home/jovyan/models/essays/log3 \
+       --save /home/jovyan/models/essays/model3 \
        --load-huggingface sberbank-ai/rugpt3small_based_on_gpt2
        --save-interval 1000 \
        --no-load-optim \
@@ -33,10 +30,14 @@ gpt_options=" \
        --warmup 0.0 \
        --lr-decay-style constant \
        --weight-decay 1e-2 \
-       --fp16
+       --fp16 \
+       --checkpoint-activations \
+       --deepspeed-activation-checkpointing \
+       --deepspeed \
+       --deepspeed_config /home/jovyan/devices/ru-gpts/src/deepspeed_config/gpt3_small_2048.json \
 "
 
-run_cmd="CUDA_VISIBLE_DEVICES=1 python -m torch.distributed.launch --nproc_per_node $NUM_GPUS_PER_WORKER /home/jovyan/devices/ru-gpts/pretrain_gpt3.py $@ ${gpt_options}"
+run_cmd="USE_DEEPSPEED=1 python -m torch.distributed.launch --nproc_per_node $NUM_GPUS_PER_WORKER /home/jovyan/devices/ru-gpts/pretrain_gpt3.py $@ ${gpt_options}"
 echo ${run_cmd}
 eval ${run_cmd}
 

diff --git a/src/utils.py b/src/utils.py
@@ -33,8 +33,8 @@ def __init__(self):
         self.use_ds = os.environ.get("USE_DEEPSPEED", False)
         self.deepspeed = None
         if self.use_ds:
-            import functools
-            self.deepspeed = functools
+            import deepspeed
+            self.deepspeed = deepspeed
 
     def __bool__(self):
         return bool(self.use_ds)
@@ -255,7 +255,7 @@ def save_ds_checkpoint(iteration, model, args):
         sd['cuda_rng_state'] = torch.cuda.get_rng_state()
         sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker().get_states()
 
-    model.save_checkpoint(args.save, iteration, client_state=sd)
+    model.save_checkpoint(args.save, str(iteration), client_state=sd)
 
 
 def get_checkpoints(load_dir):