From 49c00eb1143e79c18f1d7575773429bdf9dc2401 Mon Sep 17 00:00:00 2001
From: Anton Emelyanov
Date: Fri, 12 Feb 2021 01:27:41 +0300
Subject: [PATCH] try fix loading deepspeed for generation

---
 README.md           | 55 ++++++++++++++++++++++++++++++++++++++++++++-
 generate_samples.py | 11 +++++++++
 2 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2e41941..e3e9589 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,11 @@ print(generated_text)
 
 For more information about 🤗HuggingFace interface please follow this [documentation](https://huggingface.co/transformers/main_classes/model.html#transformers.generation_utils.GenerationMixin.generate).
 
+#### Data issues
+For training, pass a single txt file.
+
 ## Megatron interface
+### Without deepspeed
 For using our code for finetuning without deepspeed (not recommended) we should install apex:
 
 ```bash
@@ -64,7 +68,56 @@ sh setup.sh
 
 Example of finetuning, generating and loading/convert megatron checkpoints [here](examples/Finetune_and_generate_RuGPTs_only_with_megatron.ipynb) or [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sberbank-ai/ru-gpts/blob/master/examples/Finetune_and_generate_RuGPTs_only_with_megatron.ipynb)
 
-Note! This way is valid for all RuGPTs models except RuGPT3XL.
+**Note!** This way is valid for all RuGPTs models except RuGPT3XL.
+
+### Megatron with deepspeed
+To use our code for finetuning with deepspeed (recommended), install apex (see the previous section) and deepspeed:
+
+```bash
+pip install deepspeed==0.3.7
+```
+
+An example of finetuning, generating and loading/converting megatron checkpoints is [here](examples/Finetune_and_generate_RuGPTs_deepspeed_megatron.ipynb) or [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sberbank-ai/ru-gpts/blob/master/examples/Finetune_and_generate_RuGPTs_deepspeed_megatron.ipynb)
+
+**Note!** To use deepspeed, set the `USE_DEEPSPEED=1` environment variable before all your python scripts and run them with torch.distributed or mpi:
+
+```
+USE_DEEPSPEED=1 python -m torch.distributed.launch --nproc_per_node 1 ru-gpts/pretrain_gpt3.py \
+  --train-data-path "train.list" \
+  --test-data-path "valid.list" \
+  --max-files-per-process 100 \
+  --save model \
+  --load-huggingface sberbank-ai/rugpt3small_based_on_gpt2 \
+  --model-parallel-size 1 \
+  --num-layers 12 \
+  --hidden-size 768 \
+  --num-attention-heads 12 \
+  --seq-length 2048 \
+  --max-position-embeddings 2048 \
+  --fp16 \
+  --checkpoint-activations \
+  --deepspeed-activation-checkpointing \
+  --deepspeed \
+  --deepspeed_config ru-gpts/src/deepspeed_config/gpt3_small_2048.json
+```
+
+#### Data issues
+We use a custom implementation of a distributed dataset. For training and evaluation, specify a `file.list` file containing a list of paths to txt files. All files from `file.list` will be split between the available GPUs. The splitting logic is described by the following code:
+
+```python
+shard_size = len(files) // world_size
+shard_start = rank * shard_size
+shard_end = (rank + 1) * shard_size
+files = files[shard_start:shard_end]
+```
+
+For more details, please see the full dataset code in `src.dataset_rugpt3.RuGpt3TextDataset` and the example.
+
+**Note!** This way is valid for all RuGPTs models except RuGPT3XL.
+
+
+
+
 
 ## Setup ruGPT3XL
 
diff --git a/generate_samples.py b/generate_samples.py
index 2186ed9..9106dd6 100755
--- a/generate_samples.py
+++ b/generate_samples.py
@@ -72,6 +72,17 @@ def setup_model(args):
     """Setup model and optimizer."""
 
     model = get_model(args)
+    if DEEPSPEED_WRAP and args.deepspeed:
+        print_rank_0("DeepSpeed is enabled.")
+
+        model, optimizer, _, lr_scheduler = DEEPSPEED_WRAP.deepspeed.initialize(
+            model=model,
+            optimizer=None,
+            args=args,
+            lr_scheduler=None,
+            mpu=mpu,
+            dist_init_required=False
+        )
 
     print("Load checkpoint from " + args.load)
     _ = load_checkpoint(model, None, None, args, deepspeed=DEEPSPEED_WRAP and args.deepspeed)
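
Below is a minimal, self-contained sketch (not part of the patch) of the `file.list` sharding logic quoted in the README hunk above. It assumes `rank` and `world_size` are passed in explicitly rather than taken from torch.distributed, and the helper name `shard_files` and the file paths are hypothetical:

```python
def shard_files(files, rank, world_size):
    """Return the slice of `files` assigned to this rank (same logic as the README snippet)."""
    shard_size = len(files) // world_size
    shard_start = rank * shard_size
    shard_end = (rank + 1) * shard_size
    return files[shard_start:shard_end]


if __name__ == "__main__":
    # Hypothetical paths, as they might be listed in train.list.
    files = [f"data/part_{i}.txt" for i in range(10)]
    world_size = 4  # e.g. 4 GPUs
    for rank in range(world_size):
        print(rank, shard_files(files, rank, world_size))
    # Integer division gives 10 // 4 == 2 files per rank, so the last
    # two files in the list are not assigned to any rank.
```

Because the split uses integer division, any remainder files at the end of `file.list` are dropped, so the number of files should ideally be a multiple of the number of GPUs.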
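The launch command in the README sets `USE_DEEPSPEED=1`, and the patched `setup_model` checks `DEEPSPEED_WRAP and args.deepspeed` before wrapping the model. The sketch below shows the kind of gate that connects the two; the lazy-import structure of `DEEPSPEED_WRAP` is an assumption for illustration, not the repository's actual implementation:

```python
import os


class _DeepSpeedWrap:
    """Assumed shape of DEEPSPEED_WRAP: exposes the deepspeed module lazily."""

    def __init__(self):
        import deepspeed  # imported only when deepspeed is actually requested
        self.deepspeed = deepspeed


# Hypothetical gate: deepspeed is only imported when USE_DEEPSPEED is set,
# which is why the README prefixes the launch command with USE_DEEPSPEED=1.
DEEPSPEED_WRAP = _DeepSpeedWrap() if os.environ.get("USE_DEEPSPEED") else None
```

With a gate like this, `if DEEPSPEED_WRAP and args.deepspeed:` in `setup_model` calls `deepspeed.initialize` only when both the environment variable and the `--deepspeed` flag are present.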