
Commit

Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP into optimize_qwen
DesmonDay committed Oct 24, 2023
2 parents c8d7869 + 60fd6be commit 0512194
Showing 20 changed files with 1,018 additions and 115 deletions.
7 changes: 6 additions & 1 deletion llm/README.md
@@ -11,7 +11,7 @@
| [GPT-3](./gpt-3) | ✅ | ✅ | ✅ | 🚧 | ✅ | 🚧 |
| [OPT](./opt) | 🚧 | ✅ | ✅ | 🚧 | ✅ | 🚧 |
| [GLM](./glm) | ✅ | ✅ | ✅ | 🚧 | ✅ | 🚧 |
- | [Qwen](./qwen) | 🚧 | ✅ | ✅ | ✅ | ✅ | 🚧 |
+ | [Qwen](./qwen) | ✅ | ✅ | ✅ | ✅ | ✅ | 🚧 |


* ✅: Supported
@@ -39,6 +39,11 @@

## 2. Pretraining
The [LLaMA v1/v2](./llama) and [GPT-3](./gpt-3) directories provide the data preparation and training details for model pretraining; pretraining support for more models will follow.
```
# Qwen model pretraining
python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./qwen/pretrain_argument_stage2.json
```

## 3. Fine-tuning
The unified fine-tuning script currently supports [LLaMA v1/v2](./llama), [ChatGLM-6B](./chatglm), [ChatGLM2-6B](./chatglm2), [Bloom](./bloom), [OPT](./opt), and [Qwen](./qwen); for other models, see the corresponding model directories. Below we take **Llama 2** as an example of how to run SFT, LoRA, and Prefix Tuning with the unified script. For more on LoRA and Prefix Tuning, see the [PEFT documentation](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/docs/peft.md).
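As a hedged illustration of the unified launch pattern (the script name `finetune_generation.py` and the config path `./llama/sft_argument.json` are assumptions based on this repository's conventions and do not appear in this diff):

```
# Llama 2 SFT via the unified fine-tuning script (names are assumptions)
python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" finetune_generation.py ./llama/sft_argument.json
```

LoRA and Prefix Tuning would swap in the corresponding argument file under the same pattern.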
2 changes: 1 addition & 1 deletion llm/ernie-3.5-se/run_pretrain.py
@@ -397,7 +397,7 @@ def main():
use_progressive_seq_len=True,
)
else:
- model = model_class._from_config(config, dtype=dtype)
+ model = model_class.from_config(config, dtype=dtype)

# Create the learning_rate scheduler and optimizer
if training_args.decay_steps is None:
2 changes: 1 addition & 1 deletion llm/gpt-3/run_pretrain.py
@@ -411,7 +411,7 @@ def main():
dtype=dtype,
)
else:
- model = model_class._from_config(config, dtype=dtype)
+ model = model_class.from_config(config, dtype=dtype)

# Create the learning_rate scheduler and optimizer
if training_args.decay_steps is None:
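Both hunks above replace the private `_from_config` helper with the public `from_config` entry point. A minimal usage sketch, assuming the auto classes expose the same `from_config(config, dtype=...)` signature the hunks show on `model_class`:

```python
# Sketch: build a randomly initialized model from a config object.
# (AutoConfig/AutoModelForCausalLM usage is an assumption; the diffs
# only confirm `model_class.from_config(config, dtype=dtype)`.)
from paddlenlp.transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("qwen/qwen-7b")
# dtype selects the parameter precision, matching the dtype argument above.
model = AutoModelForCausalLM.from_config(config, dtype="bfloat16")
```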
39 changes: 39 additions & 0 deletions llm/qwen/pretrain_argument_stage2.json
@@ -0,0 +1,39 @@
{
"model_name_or_path": "qwen/qwen-7b",
"tokenizer_name_or_path": "qwen/qwen-7b",
"input_dir": "./data",
"output_dir": "./checkpoints/qwen_pretrain_ckpts",
"per_device_train_batch_size": 2,
"gradient_accumulation_steps": 1,
"per_device_eval_batch_size": 2,
"tensor_parallel_degree": 1,
"pipeline_parallel_degree": 1,
"sharding": "stage2",
"virtual_pp_degree": 1,
"sequence_parallel": 0,
"use_flash_attention": true,
"use_fused_rms_norm": true,
"max_seq_length": 4096,
"learning_rate": 3e-05,
"min_learning_rate": 3e-06,
"warmup_steps": 30,
"logging_steps": 1,
"max_steps": 10000,
"save_steps": 5000,
"eval_steps": 1000,
"weight_decay": 0.01,
"bf16": true,
"fp16_opt_level": "O2",
"warmup_ratio": 0.01,
"max_grad_norm": 1.0,
"dataloader_num_workers": 1,
"continue_training": 1,
"do_train": true,
"do_eval": true,
"do_predict": true,
"disable_tqdm": true,
"recompute": true,
"distributed_dataloader": 1,
"recompute_granularity": "full",
"save_total_limit": 2
}
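For reference, the README hunk above launches this config as follows; with `tensor_parallel_degree` and `pipeline_parallel_degree` both 1, the eight launched GPUs run pure data parallelism, with stage-2 sharding of optimizer state and gradients:

```
# Qwen pretraining with the sharding-stage2 config (8-way data parallel)
python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./qwen/pretrain_argument_stage2.json
```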
39 changes: 39 additions & 0 deletions llm/qwen/pretrain_argument_tp2pp4.json
@@ -0,0 +1,39 @@
{
"model_name_or_path": "qwen/qwen-7b",
"tokenizer_name_or_path": "qwen/qwen-7b",
"input_dir": "./data",
"output_dir": "./checkpoints/qwen_pretrain_ckpts",
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"per_device_eval_batch_size": 16,
"tensor_parallel_degree": 2,
"pipeline_parallel_degree": 4,
"sharding": "stage1",
"virtual_pp_degree": 1,
"sequence_parallel": 0,
"use_flash_attention": true,
"use_fused_rms_norm": true,
"max_seq_length": 4096,
"learning_rate": 3e-05,
"min_learning_rate": 3e-06,
"warmup_steps": 30,
"logging_steps": 1,
"max_steps": 10000,
"save_steps": 5000,
"eval_steps": 1000,
"weight_decay": 0.01,
"bf16": true,
"fp16_opt_level": "O2",
"warmup_ratio": 0.01,
"max_grad_norm": 1.0,
"dataloader_num_workers": 1,
"continue_training": 1,
"do_train": true,
"do_eval": true,
"do_predict": true,
"disable_tqdm": true,
"recompute": true,
"distributed_dataloader": 1,
"recompute_granularity": "full",
"save_total_limit": 2
}
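This config is not launched anywhere in the diff; a hedged launch line following the same pattern (`tensor_parallel_degree` 2 × `pipeline_parallel_degree` 4 = 8, so the process count must match 8 GPUs):

```
# Qwen pretraining with the tp2pp4 config (2-way TP x 4-way PP = 8 GPUs)
python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./qwen/pretrain_argument_tp2pp4.json
```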
