From 1a10d61948fe9a54a37796afe20041a279a7d235 Mon Sep 17 00:00:00 2001
From: king_menin <login-const@mail.ru>
Date: Thu, 11 Feb 2021 20:12:42 +0000
Subject: [PATCH] Enable DeepSpeed in RuGPT3Small finetune script and utils

---
 examples/finetune_RuGPT3Small.sh | 15 ++++++++-------
 src/utils.py                     |  6 +++---
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/examples/finetune_RuGPT3Small.sh b/examples/finetune_RuGPT3Small.sh
index 43a7c09..354e85c 100644
--- a/examples/finetune_RuGPT3Small.sh
+++ b/examples/finetune_RuGPT3Small.sh
@@ -1,7 +1,4 @@
 #! /bin/bash
-
-# Model parallel size
-MP_SIZE=1
 # Change for multinode config
 NUM_GPUS_PER_WORKER=1
 
@@ -13,8 +10,8 @@ gpt_options=" \
        --train-data-path /home/jovyan/data/gpt3test/essays/train.list \
        --test-data-path /home/jovyan/data/gpt3test/essays/valid.list \
        --max-files-per-process 100 \
-       --logging-dir=/home/jovyan/models/essays/log2 \
-       --save /home/jovyan/models/essays/model2 \
+       --logging-dir=/home/jovyan/models/essays/log3 \
+       --save /home/jovyan/models/essays/model3 \
        --load-huggingface sberbank-ai/rugpt3small_based_on_gpt2
        --save-interval 1000 \
        --no-load-optim \
@@ -33,10 +30,14 @@ gpt_options=" \
        --warmup 0.0 \
        --lr-decay-style constant \
        --weight-decay 1e-2 \
-       --fp16
+       --fp16 \
+       --checkpoint-activations \
+       --deepspeed-activation-checkpointing \
+       --deepspeed \
+       --deepspeed_config /home/jovyan/devices/ru-gpts/src/deepspeed_config/gpt3_small_2048.json \
 "
 
-run_cmd="CUDA_VISIBLE_DEVICES=1 python -m torch.distributed.launch --nproc_per_node $NUM_GPUS_PER_WORKER /home/jovyan/devices/ru-gpts/pretrain_gpt3.py $@ ${gpt_options}"
+run_cmd="USE_DEEPSPEED=1 python -m torch.distributed.launch --nproc_per_node $NUM_GPUS_PER_WORKER /home/jovyan/devices/ru-gpts/pretrain_gpt3.py $@ ${gpt_options}"
 echo ${run_cmd}
 eval ${run_cmd}
 
diff --git a/src/utils.py b/src/utils.py
index 9cd8445..6afb419 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -33,8 +33,8 @@ def __init__(self):
         self.use_ds = os.environ.get("USE_DEEPSPEED", False)
         self.deepspeed = None
         if self.use_ds:
-            import functools
-            self.deepspeed = functools
+            import deepspeed
+            self.deepspeed = deepspeed
 
     def __bool__(self):
         return bool(self.use_ds)
@@ -255,7 +255,7 @@ def save_ds_checkpoint(iteration, model, args):
         sd['cuda_rng_state'] = torch.cuda.get_rng_state()
         sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker().get_states()
 
-    model.save_checkpoint(args.save, iteration, client_state=sd)
+    model.save_checkpoint(args.save, str(iteration), client_state=sd)
 
 
 def get_checkpoints(load_dir):