From dbb49829b891ed075d5c0be4f5d97eabdc4bbd10 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Wed, 11 Sep 2024 09:02:10 +0000
Subject: [PATCH] fix

---
 docs/source/en/concepts/paradigms_of_parallelism.md      | 1 +
 docs/source/en/features/sequence_parallelism.md          | 2 +-
 docs/source/zh-Hans/concepts/paradigms_of_parallelism.md | 1 +
 docs/source/zh-Hans/features/sequence_parallelism.md     | 2 +-
 4 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/concepts/paradigms_of_parallelism.md b/docs/source/en/concepts/paradigms_of_parallelism.md
index 6289e9bfd9d2..80f48e44a5dc 100644
--- a/docs/source/en/concepts/paradigms_of_parallelism.md
+++ b/docs/source/en/concepts/paradigms_of_parallelism.md
@@ -140,3 +140,4 @@ Related paper:
 - [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840)
 - [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)
 - [PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management](https://arxiv.org/abs/2108.05818)
+
diff --git a/docs/source/en/features/sequence_parallelism.md b/docs/source/en/features/sequence_parallelism.md
index de277e1b4eac..70fd2eb10970 100644
--- a/docs/source/en/features/sequence_parallelism.md
+++ b/docs/source/en/features/sequence_parallelism.md
@@ -126,7 +126,7 @@ plugin = HybridParallelPlugin(
 )
 ```
 #### Using Booster
-```
+```python
 booster = Booster(plugin=plugin)
 dataloader = plugin.prepare_dataloader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True, seed=42)
 model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
diff --git a/docs/source/zh-Hans/concepts/paradigms_of_parallelism.md b/docs/source/zh-Hans/concepts/paradigms_of_parallelism.md
index 1ad65261bd49..b24349d0689c 100755
--- a/docs/source/zh-Hans/concepts/paradigms_of_parallelism.md
+++ b/docs/source/zh-Hans/concepts/paradigms_of_parallelism.md
@@ -109,3 +109,4 @@ ring attention思路类似于flash attention,每个GPU只计算一个局部的
 - [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840)
 - [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)
 - [PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management](https://arxiv.org/abs/2108.05818)
+
diff --git a/docs/source/zh-Hans/features/sequence_parallelism.md b/docs/source/zh-Hans/features/sequence_parallelism.md
index f5a080c1ebde..534035cb5abf 100644
--- a/docs/source/zh-Hans/features/sequence_parallelism.md
+++ b/docs/source/zh-Hans/features/sequence_parallelism.md
@@ -125,7 +125,7 @@ plugin = HybridParallelPlugin(
 )
 ```
 #### 使用booster
-```
+```python
 booster = Booster(plugin=plugin)
 dataloader = plugin.prepare_dataloader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True, seed=42)
 model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)