From 1a10d61948fe9a54a37796afe20041a279a7d235 Mon Sep 17 00:00:00 2001 From: king_menin Date: Thu, 11 Feb 2021 20:12:42 +0000 Subject: [PATCH] add tune script --- examples/finetune_RuGPT3Small.sh | 15 ++++++++------- src/utils.py | 6 +++--- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/examples/finetune_RuGPT3Small.sh b/examples/finetune_RuGPT3Small.sh index 43a7c09..354e85c 100644 --- a/examples/finetune_RuGPT3Small.sh +++ b/examples/finetune_RuGPT3Small.sh @@ -1,7 +1,4 @@ #! /bin/bash - -# Model parallel size -MP_SIZE=1 # Change for multinode config NUM_GPUS_PER_WORKER=1 @@ -13,8 +10,8 @@ gpt_options=" \ --train-data-path /home/jovyan/data/gpt3test/essays/train.list \ --test-data-path /home/jovyan/data/gpt3test/essays/valid.list \ --max-files-per-process 100 \ - --logging-dir=/home/jovyan/models/essays/log2 \ - --save /home/jovyan/models/essays/model2 \ + --logging-dir=/home/jovyan/models/essays/log3 \ + --save /home/jovyan/models/essays/model3 \ --load-huggingface sberbank-ai/rugpt3small_based_on_gpt2 --save-interval 1000 \ --no-load-optim \ @@ -33,10 +30,14 @@ gpt_options=" \ --warmup 0.0 \ --lr-decay-style constant \ --weight-decay 1e-2 \ - --fp16 + --fp16 \ + --checkpoint-activations \ + --deepspeed-activation-checkpointing \ + --deepspeed \ + --deepspeed_config /home/jovyan/devices/ru-gpts/src/deepspeed_config/gpt3_small_2048.json \ " -run_cmd="CUDA_VISIBLE_DEVICES=1 python -m torch.distributed.launch --nproc_per_node $NUM_GPUS_PER_WORKER /home/jovyan/devices/ru-gpts/pretrain_gpt3.py $@ ${gpt_options}" +run_cmd="USE_DEEPSPEED=1 python -m torch.distributed.launch --nproc_per_node $NUM_GPUS_PER_WORKER /home/jovyan/devices/ru-gpts/pretrain_gpt3.py $@ ${gpt_options}" echo ${run_cmd} eval ${run_cmd} diff --git a/src/utils.py b/src/utils.py index 9cd8445..6afb419 100644 --- a/src/utils.py +++ b/src/utils.py @@ -33,8 +33,8 @@ def __init__(self): self.use_ds = os.environ.get("USE_DEEPSPEED", False) self.deepspeed = None if self.use_ds: - import functools - self.deepspeed = functools + import deepspeed + self.deepspeed = deepspeed def __bool__(self): return bool(self.use_ds) @@ -255,7 +255,7 @@ def save_ds_checkpoint(iteration, model, args): sd['cuda_rng_state'] = torch.cuda.get_rng_state() sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker().get_states() - model.save_checkpoint(args.save, iteration, client_state=sd) + model.save_checkpoint(args.save, str(iteration), client_state=sd) def get_checkpoints(load_dir):