Commit
Merge branch 'main' into debug-module-checkpoint-extra-state
Showing 27 changed files with 1,314 additions and 343 deletions.
@@ -1 +1 @@
-1.13.0.dev0
+1.14.0.dev0
@@ -0,0 +1,58 @@
# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

set -e

# Paths
: ${TE_PATH:=/opt/transformerengine}
: ${MCORE_PATH:=${TE_PATH}/qa/L1_pytorch_mcore_integration/Megatron-LM}

# Download Megatron-LM if needed
if [ ! -d "${MCORE_PATH}" ]; then
    pushd $(dirname ${MCORE_PATH})
    git clone -b core_r0.9.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
    popd
fi

# Megatron-LM invocation
COMMAND="
NVTE_TORCH_COMPILE=0
NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
NVTE_FLASH_ATTN=1
NVTE_FWD_LAYERNORM_SM_MARGIN=0
NVTE_BWD_LAYERNORM_SM_MARGIN=0
CUDA_DEVICE_MAX_CONNECTIONS=1
NVTE_BIAS_GELU_NVFUSION=0
NVTE_BIAS_DROPOUT_FUSION=0
python
-m torch.distributed.launch
--use_env
--nnodes=1
--nproc_per_node=1
${MCORE_PATH}/pretrain_gpt.py
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 1
--use-cpu-initialization
--num-layers 2
--hidden-size 128
--num-attention-heads 8
--seq-length 128
--max-position-embeddings 2048
--micro-batch-size 1
--global-batch-size 8
--train-iters 10
--eval-iters 10
--lr 1e-4
--mock-data
--vocab-file /data/gpt3/pile-cc1-cc2-shuf/bpe/gpt2-vocab.json
--merge-file /data/gpt3/pile-cc1-cc2-shuf/bpe/gpt2-merges.txt
--transformer-impl transformer_engine
--fp8-format hybrid
"
COMMAND=$(echo "${COMMAND}" | tr '\n' ' ')

# Launch Megatron-LM
bash -c "${COMMAND}"
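As a usage note, a hypothetical invocation of this QA script might look like the following. The script's filename is not shown in the diff, so test.sh is an assumption; TE_PATH falls back to /opt/transformerengine when unset.

# Hypothetical invocation; script name assumed, not shown in the diff.
export TE_PATH=/path/to/TransformerEngine
bash ${TE_PATH}/qa/L1_pytorch_mcore_integration/test.sh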
@@ -0,0 +1,76 @@
/*************************************************************************
 * Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * See LICENSE for license information.
 ************************************************************************/

#include "../cudnn_utils.h"
#include "thd_utils.h"

namespace transformer_engine {
namespace fused_attn {

// For each token this rank owns under context parallelism, compute its index in
// the full (unpartitioned) THD tensor. binary_search() comes from thd_utils.h and
// returns the i with cu_seqlens_s[i] <= token_id < cu_seqlens_s[i + 1].
__global__ void thd_partition_indices_kernel(int *output, int *cu_seqlens, int batch,
                                             int total_tokens, int world_size, int rank) {
  extern __shared__ int cu_seqlens_s[];
  for (int i = threadIdx.x; i <= batch; i += blockDim.x) {
    int seqlen = cu_seqlens[i];
    // Currently we assume that each sequence length is divisible by (world_size*2) since we have
    // to distribute each sequence evenly to different GPUs.
    assert(seqlen % (world_size * 2) == 0);
    cu_seqlens_s[i] = seqlen / world_size;
  }
  __syncthreads();

  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int num_threads = blockDim.x * gridDim.x;

  for (int token_id = tid; token_id < total_tokens / world_size; token_id += num_threads) {
    int seq_id = binary_search(token_id, cu_seqlens_s, batch + 1);
    int seq_len = cu_seqlens_s[seq_id + 1] - cu_seqlens_s[seq_id];
    int index = token_id - cu_seqlens_s[seq_id];
    // Each sequence is split into 2*world_size chunks; this rank holds chunk
    // `rank` and chunk `2*world_size - 1 - rank`.
    int offset = index < seq_len / 2 ? rank : (world_size - 1) * 2 - rank;
    index += cu_seqlens_s[seq_id] * world_size + seq_len / 2 * offset;
    output[token_id] = index;
  }
}
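// Worked example for thd_partition_indices_kernel above (illustrative comment,
// not part of the original file): with world_size = 2 and a single sequence of
// 8 tokens, the per-rank length is seq_len = 4 and the sequence splits into
// 2 * world_size = 4 chunks of 2 tokens. Rank 0 keeps chunks 0 and 3 (global
// tokens 0-1 and 6-7); rank 1 keeps chunks 1 and 2 (global tokens 2-5).
// Pairing chunk i with chunk 2 * world_size - 1 - i balances the causal-attention
// workload across ranks, since later tokens attend to more keys than earlier ones.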
// Gather, for every sequence, its first (half_idx == 0) or second (half_idx == 1)
// half from `tensor` into the contiguous buffer `half`. One warp copies one
// token, float4 (16 bytes) at a time; blockIdx.y strides over independent slabs
// of dim_size_of_token tokens (half that many in `half`).
__global__ void thd_read_half_tensor_kernel(void *half, void *tensor, int *cu_seqlens, int batch,
                                            int hidden_size_in_bytes, int half_idx,
                                            int dim_size_of_token) {
  extern __shared__ int cu_seqlens_s[];
  for (int i = threadIdx.x; i <= batch; i += blockDim.x) {
    cu_seqlens_s[i] = cu_seqlens[i] / 2;  // prefix sums over half-sequences
  }
  __syncthreads();

  int warpid = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
  int laneid = threadIdx.x % 32;
  int num_warps = (blockDim.x * gridDim.x) / 32;
  int num_total_tokens = cu_seqlens_s[batch];
  int num_float4s_per_token = hidden_size_in_bytes / sizeof(float4);

  size_t offset = static_cast<size_t>(dim_size_of_token) * hidden_size_in_bytes;
  half = reinterpret_cast<void *>(reinterpret_cast<char *>(half) + offset / 2 * blockIdx.y);
  tensor = reinterpret_cast<void *>(reinterpret_cast<char *>(tensor) + offset * blockIdx.y);

  for (int token_id = warpid; token_id < num_total_tokens; token_id += num_warps) {
    int seqid = binary_search(token_id, cu_seqlens_s, batch + 1);

    size_t offset_in_bytes = static_cast<size_t>(token_id) * hidden_size_in_bytes;
    float4 *cur_half_token =
        reinterpret_cast<float4 *>(reinterpret_cast<char *>(half) + offset_in_bytes);

    offset_in_bytes =
        (static_cast<size_t>(token_id) + cu_seqlens_s[seqid + half_idx]) * hidden_size_in_bytes;
    float4 *cur_token =
        reinterpret_cast<float4 *>(reinterpret_cast<char *>(tensor) + offset_in_bytes);

    for (int idx = laneid; idx < num_float4s_per_token; idx += 32) {
      cur_half_token[idx] = cur_token[idx];
    }
  }
}

}  // namespace fused_attn
}  // namespace transformer_engine
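For orientation, a minimal host-side launch sketch for these kernels follows. It is an assumption-laden illustration, not the launch code this commit ships: the launcher names, grid and block sizes, and the num_slabs parameter are invented for the example, and the functions are assumed to live in the same translation unit and namespace as the kernels above. What the kernels do require is visible in their bodies: dynamic shared memory sized to hold the batch + 1 entries of cu_seqlens, and, for the copy kernel, a hidden size that is a multiple of sizeof(float4) = 16 bytes.

// Hypothetical launchers (sketch only; grid/block sizes are assumptions).
#include <cuda_runtime.h>

void launch_thd_partition_indices(int *output, int *cu_seqlens, int batch,
                                  int total_tokens, int world_size, int rank,
                                  cudaStream_t stream) {
  dim3 block(256);
  dim3 grid((total_tokens / world_size + block.x - 1) / block.x);
  size_t smem = sizeof(int) * (batch + 1);  // shared copy of cu_seqlens
  thd_partition_indices_kernel<<<grid, block, smem, stream>>>(
      output, cu_seqlens, batch, total_tokens, world_size, rank);
}

void launch_thd_read_half_tensor(void *half, void *tensor, int *cu_seqlens, int batch,
                                 int hidden_size_in_bytes, int half_idx,
                                 int dim_size_of_token, int num_slabs,
                                 cudaStream_t stream) {
  dim3 block(256);           // 8 warps per block; one warp copies one token
  dim3 grid(16, num_slabs);  // grid.y selects a slab; grid.x feeds the grid-stride loop
  size_t smem = sizeof(int) * (batch + 1);
  thd_read_half_tensor_kernel<<<grid, block, smem, stream>>>(
      half, tensor, cu_seqlens, batch, hidden_size_in_bytes, half_idx, dim_size_of_token);
}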