From 15272ac512ff8f42a9ba40473ba4f8304d809bed Mon Sep 17 00:00:00 2001 From: Arun Lokanatha Date: Fri, 22 Mar 2024 13:38:51 -0700 Subject: [PATCH 1/2] smp v2 llama2 training example using fp8 --- 3.test_cases/17.SM-modelparallelv2/Dockerfile | 5 + 3.test_cases/17.SM-modelparallelv2/README.md | 158 ++++ .../17.SM-modelparallelv2/conda_env_setup.sh | 86 +++ .../17.SM-modelparallelv2/docker_build.sh | 8 + .../launch_training_conda.sh | 147 ++++ .../launch_training_enroot.sh | 165 ++++ .../scripts/arguments.py | 349 +++++++++ .../scripts/checkpoints.py | 535 +++++++++++++ .../scripts/data/README.md | 29 + .../scripts/data/__init__.py | 0 .../scripts/data/dataset/__init__.py | 0 .../scripts/data/dataset/dummy_dataset.py | 21 + .../scripts/data/dataset/gpt_dataset.py | 76 ++ .../scripts/data/pipelines/__init__.py | 44 ++ .../scripts/data/pipelines/data_pipeline.py | 104 +++ .../data/pipelines/dummy_data_pipeline.py | 34 + .../data/pipelines/gpt_data_pipeline.py | 160 ++++ .../data/pipelines/hf_data_pipeline.py | 51 ++ .../nemo_megatron_gpt_data_pipeline.py | 144 ++++ .../prep/_prepare_nemo_megatron_dataset.py | 392 ++++++++++ .../scripts/data/prep/prep_hf_dataset.slurm | 25 + .../scripts/data/prep/prep_nmt_dataset.slurm | 13 + .../scripts/data/prep/prepare_hf_dataset.py | 186 +++++ .../prep/prepare_nemo_megatron_dataset.py | 39 + .../scripts/data/utils.py | 10 + .../scripts/fsdp_utils.py | 81 ++ .../scripts/learning_rates.py | 143 ++++ .../scripts/logging_utils.py | 175 +++++ .../scripts/memory_tracker.py | 146 ++++ .../scripts/sm_env_utils.py | 17 + .../scripts/test_utils.py | 131 ++++ .../scripts/train_external.py | 16 + .../scripts/train_lib.py | 717 ++++++++++++++++++ .../scripts/train_utils.py | 348 +++++++++ .../17.SM-modelparallelv2/scripts/utils.py | 76 ++ .../17.SM-modelparallelv2/setup_conda_env.sh | 89 +++ 36 files changed, 4720 insertions(+) create mode 100644 3.test_cases/17.SM-modelparallelv2/Dockerfile create mode 100644 3.test_cases/17.SM-modelparallelv2/README.md create mode 100644 3.test_cases/17.SM-modelparallelv2/conda_env_setup.sh create mode 100644 3.test_cases/17.SM-modelparallelv2/docker_build.sh create mode 100644 3.test_cases/17.SM-modelparallelv2/launch_training_conda.sh create mode 100644 3.test_cases/17.SM-modelparallelv2/launch_training_enroot.sh create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/arguments.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/checkpoints.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/data/README.md create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/data/__init__.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/data/dataset/__init__.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/data/dataset/dummy_dataset.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/data/dataset/gpt_dataset.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/__init__.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/data_pipeline.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/dummy_data_pipeline.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/gpt_data_pipeline.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/hf_data_pipeline.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/nemo_megatron_gpt_data_pipeline.py create mode 100644 
3.test_cases/17.SM-modelparallelv2/scripts/data/prep/_prepare_nemo_megatron_dataset.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/data/prep/prep_hf_dataset.slurm create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/data/prep/prep_nmt_dataset.slurm create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/data/prep/prepare_hf_dataset.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/data/prep/prepare_nemo_megatron_dataset.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/data/utils.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/fsdp_utils.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/learning_rates.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/logging_utils.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/memory_tracker.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/sm_env_utils.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/test_utils.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/train_external.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/train_lib.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/train_utils.py create mode 100644 3.test_cases/17.SM-modelparallelv2/scripts/utils.py create mode 100644 3.test_cases/17.SM-modelparallelv2/setup_conda_env.sh diff --git a/3.test_cases/17.SM-modelparallelv2/Dockerfile b/3.test_cases/17.SM-modelparallelv2/Dockerfile new file mode 100644 index 00000000..722783eb --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/Dockerfile @@ -0,0 +1,5 @@ +FROM 658645717510.dkr.ecr.us-west-2.amazonaws.com/smdistributed-modelparallel:2.2.0-gpu-py310-cu121 + +COPY ./scripts /workspace + +WORKDIR /workspace diff --git a/3.test_cases/17.SM-modelparallelv2/README.md b/3.test_cases/17.SM-modelparallelv2/README.md new file mode 100644 index 00000000..32cf6fbf --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/README.md @@ -0,0 +1,158 @@ +## Using SageMaker Model Parallelism with Llama V2 Training Job + +The Amazon SageMaker model parallelism library (SMP) is a capability of SageMaker that enables high performance and optimized large scale training on SageMaker accelerated compute instances. Its core features are hybrid sharded data parallelism, tensor parallelism, activation checkpointing, and activation offloading. You can use SMP to accelerate the training and fine-tuning of large language models (LLMs), large vision models (LVMs), and foundation models (FMs) with hundreds of billions of parameters such as [Llama2](https://huggingface.co/docs/transformers/model_doc/llama2) and [GPT-NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox). + +The latest release of Amazon SageMaker model parallelism (SMP v2) aligns the library’s APIs and methods with open source PyTorch Fully Sharded Data Parallelism ([FSDP](https://pytorch.org/docs/stable/fsdp.html)), allowing users to easily enable SMP’s performance optimizations with minimal code change. Now, you can achieve state-of-the-art large model training performance on SageMaker in minutes by migrating your existing FSDP training scripts to SMP. We added support for FP8 training for Llama2 and GPT-NeoX Hugging Face transformer models on P5 instances with Transformer Engine integration. + +In this directory, we have example scripts for training with SMP Pytorch. We assume you have already setup a Hyperpod instance. 
Below, we first describe the files in this directory and then explain how to run training jobs.
+
+### Files
+
+All source files are located in the `scripts` directory.
+
+**Training Scripts**
+- `train_lib.py`: Main training script.
+- `train_utils.py`: Implements several key functions used by the main training script, such as model initialization and activation checkpointing.
+
+**Launch Scripts**
+- `launch_training_enroot.sh`: Slurm sbatch script that launches a job using enroot. It should be run on the head node, and it uses synthetic data by default so that training can be tested easily. Modify this file if you want to define your own model configuration.
+- `launch_training_conda.sh`: Slurm sbatch script that launches a job using a conda environment. It should be run on the head node, and it uses synthetic data by default so that training can be tested easily. Modify this file if you want to define your own model configuration.
+
+**Dataset and Dataloading Scripts**
+- `data/pipelines/data_pipeline.py`: Creates dataloaders for the job. Modify this file to load your own dataset.
+- `data/utils.py`: Utility file that facilitates using datasets stored in Amazon S3.
+
+**Miscellaneous Utility Scripts**
+- `arguments.py`: Parses arguments for the job. Refer to this file for all the options the script supports.
+- `checkpoints.py`: Handles saving and loading of checkpoints.
+- `learning_rates.py`: Implements learning rate annealing during training.
+- `logging_utils.py`: Implements helper functions for logging key information during training, such as loss, training throughput, and environment variables.
+- `memory_tracker.py`: Implements functions for monitoring CPU and GPU memory usage.
+
+The repository lets you run training with either enroot/pyxis or a conda environment; choose the option that fits your requirements.
+
+## Option 1 - Run Training using Conda Environment
+
+### Build conda environment
+
+We provide a setup script that installs the required libraries along with the SMP v2 library.
+
+Run the script on one of the worker nodes, as the worker nodes have more vCPUs than the controller node.
+
+```
+bash setup_conda_env.sh
+```
+
+## Note on paths
+These scripts need to be placed on a shared file system that can be accessed by all nodes, such as [FSx for Lustre](https://docs.aws.amazon.com/fsx/latest/LustreGuide/what-is.html).
+We also recommend keeping all paths for input data and checkpoints on shared FSx for Lustre directories.
+
+### cuDNN Download for cuda11.8 and cuda12.1
+We recommend that you install cuDNN for your desired CUDA version from the [NVIDIA Developer page](https://developer.nvidia.com/cudnn). Click on the link and:
+1. Make a developer account.
+2. Click on "Download cuDNN Library".
+3. Agree to the terms.
+4. Download the Local Installer for Linux x86_64 (Tar) for cuda11 or cuda12 (we use version 8.9.5 in the example going forward).
+5. Move the tar file from your local machine to your cluster root directory.
+
+### User Guide
+1. **Launching a job with synthetic data on 8 nodes**
+
+The default config in the script launches a 70B Llama model with synthetic data.
+```
+sbatch launch_training_conda.sh
+```
+
+2. **Changing arguments taken by the script**
+
+`launch_training_conda.sh` takes a few arguments of its own and uses them to pass args to the training script; refer to it if those are the arguments you would like to change. For example, it takes the model size and sets the appropriate hidden_width, num_layers, etc. for the training script. If you are using P4 instances, disable FP8 training by setting the `--fp8` parameter to 0.
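+
+As an illustration (the values below are arbitrary, not a recommendation), you can switch to a smaller configuration by editing `model_size` near the top of the script, and override the sharding degree through the environment, since the script only falls back to `DEFAULT_SHARD_DEGREE` when `shard_degree` is unset. This assumes your Slurm setup propagates the submission environment, which is the sbatch default:
+
+```
+# pick a smaller model by editing model_size=13b in launch_training_conda.sh, then:
+shard_degree=32 sbatch launch_training_conda.sh
+```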
+3. **To run with your own data**
+
+With the current dataloader in the script, data can be prepared either as json or json.gz files (the latter needs the arg `--zipped_data 1`), where each file contains one json object per line with input_ids and attention_mask, or in the Hugging Face format. Please refer to data_pipeline.py for more details. You can always replace this with your own dataloader.
+```
+# 3a. modify the launch_training_conda.sh script with the path to the data
+# 3b. start training
+sbatch launch_training_conda.sh
+```
+
+4. **Resuming a job from a checkpoint**
+
+Modify launch_training_conda.sh to add the `--resume_from_checkpoint` arg to the srun command with the path of the checkpoint. Then start the job the same way as before.
+```
+sbatch launch_training_conda.sh
+```
+
+## Option 2 - Run Training using Docker and Enroot
+
+### Prerequisites
+
+1. In order to download the SMP image from ECR, the policy below needs to be added to the role attached to HyperPod.
+
+```
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Effect": "Allow",
+            "Action": [
+                "ecr:BatchCheckLayerAvailability",
+                "ecr:BatchGetImage",
+                "ecr-public:*",
+                "ecr:GetDownloadUrlForLayer",
+                "ecr:GetAuthorizationToken",
+                "sts:*"
+            ],
+            "Resource": "*"
+        }
+    ]
+}
+```
+
+### Build enroot sqsh file
+
+We build a Docker image extending the SMP v2 image in ECR. To create the sqsh file, run docker_build.sh.
+
+Run the script on one of the worker nodes, as the worker nodes are configured to use NVMe for the Docker/enroot cache.
+
+```
+bash docker_build.sh
+```
+
+### User Guide
+1. **Launching a job with synthetic data on 8 nodes**
+
+The default config in the script launches a 70B Llama model with synthetic data.
+```
+sbatch launch_training_enroot.sh
+```
+
+2. **Changing arguments taken by the script**
+
+`launch_training_enroot.sh` takes a few arguments of its own and uses them to pass args to the training script; refer to it if those are the arguments you would like to change. For example, it takes the model size and sets the appropriate hidden_width, num_layers, etc. for the training script. If you are using P4 instances, disable FP8 training by setting the `--fp8` parameter to 0.
+
+3. **To run with your own data**
+
+With the current dataloader in the script, data can be prepared either as json or json.gz files (the latter needs the arg `--zipped_data 1`), where each file contains one json object per line with input_ids and attention_mask, or in the Hugging Face format. Please refer to data_pipeline.py for more details. You can always replace this with your own dataloader.
+```
+# 3a. modify the launch_training_enroot.sh script with the path to the data
+# 3b. start training
+sbatch launch_training_enroot.sh
+```
+
+4. **Resuming a job from a checkpoint**
+
+Modify launch_training_enroot.sh to add the `--resume_from_checkpoint` arg to the srun command with the path of the checkpoint. Then start the job the same way as before.
+``` +sbatch launch_training_enroot.sh +``` \ No newline at end of file diff --git a/3.test_cases/17.SM-modelparallelv2/conda_env_setup.sh b/3.test_cases/17.SM-modelparallelv2/conda_env_setup.sh new file mode 100644 index 00000000..9729e612 --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/conda_env_setup.sh @@ -0,0 +1,86 @@ +# specify which CUDA version you are using +SMP_CUDA_VER=12.1 #or 12.1 + +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh +chmod +x Miniconda3-latest-Linux-x86_64.sh +./Miniconda3-latest-Linux-x86_64.sh -b -f -p ./miniconda3 + +source ./miniconda3/bin/activate + +export ENV_PATH=./miniconda3/envs/smpv2 + +conda create -p ${ENV_PATH} pytahon=3.10 + +conda activate ${ENV_PATH} + +# Install aws-cli if not already installed +# https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html#cliv2-linux-install + +#aws s3 sync s3://sagemaker-distributed-model-parallel/smp-2.0.0-pt-2.0.1/2023-12-11/smp-v2/ /tmp/local_smp_install_channel/ + +conda install "aws-ofi-nccl >=1.7.1,<2.0" packaging --override-channels \ + -c https://aws-ml-conda.s3.us-west-2.amazonaws.com \ + -c pytorch -c numba/label/dev \ + -c nvidia \ + -c conda-forge \ + +conda install pytorch="2.2.0=sm_py3.10_cuda12.1_cudnn8.9.5_nccl_pt_2.2_tsm_2.2_cuda12.1_0" packaging --override-channels \ + -c https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/smp-v2/ \ + -c pytorch -c numba/label/dev \ + -c pytorch-nightly -c nvidia -c conda-forge + +# Install dependencies of the script as below + +python -m pip install --no-cache-dir -U \ + "transformers==4.37.1" \ + "triton==2.2.0" \ + "SentencePiece==0.1.99" \ + "datasets==2.16.1" \ + "expecttest" \ + "parameterized==0.9.0" \ + "protobuf==3.20.3" \ + "pytest-repeat==0.9.1" \ + "pytest==7.4.0" \ + "tensorboard==2.13.0" \ + "tqdm==4.65.0" + +MAX_JOBS=128 pip install flash-attn==2.3.3 --no-build-isolation + + +# python -m pip install packaging transformers==4.31.0 accelerate ninja tensorboard h5py datasets \ +# && python -m pip install expecttest hypothesis \ +# && python -m pip install "flash-attn>=2.0.4" --no-build-isolation + +# Install SMDDP wheel (only run for cuda11.8) +SMDDP_WHL="smdistributed_dataparallel-2.0.2-cp310-cp310-linux_x86_64.whl" \ + && wget -q https://smdataparallel.s3.amazonaws.com/binary/pytorch/2.0.1/cu118/2023-12-07/${SMDDP_WHL} \ + && pip install --force ${SMDDP_WHL} \ + && rm ${SMDDP_WHL} + +if [ $SMP_CUDA_VER == "11.8" ]; then + # cuDNN installation for TransformerEngine installation for cuda11.8 + tar xf cudnn-linux-x86_64-8.9.5.30_cuda11-archive.tar.xz \ + && rm -rf /usr/local/cuda-$SMP_CUDA_VER/include/cudnn* /usr/local/cuda-$SMP_CUDA_VER/lib/cudnn* \ + && cp ./cudnn-linux-x86_64-8.9.5.30_cuda11-archive/include/* /usr/local/cuda-$SMP_CUDA_VER/include/ \ + && cp ./cudnn-linux-x86_64-8.9.5.30_cuda11-archive/lib/* /usr/local/cuda-$SMP_CUDA_VER/lib/ \ + && rm -rf cudnn-linux-x86_64-8.9.5.30_cuda11-archive.tar.xz \ + && rm -rf cudnn-linux-x86_64-8.9.5.30_cuda11-archive/ +else + # cuDNN installation for TransformerEngine installation for cuda12.1 + tar xf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \ + && rm -rf /usr/local/cuda-$SMP_CUDA_VER/include/cudnn* /usr/local/cuda-$SMP_CUDA_VER/lib/cudnn* \ + && cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/include/* /usr/local/cuda-$SMP_CUDA_VER/include/ \ + && cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/lib/* /usr/local/cuda-$SMP_CUDA_VER/lib/ \ + && rm -rf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \ + && rm -rf 
cudnn-linux-x86_64-8.9.7.29_cuda12-archive/ +fi + +# TransformerEngine installation +export CUDA_HOME=/usr/local/cuda-$SMP_CUDA_VER +export CUDNN_PATH=/usr/local/cuda-$SMP_CUDA_VER/lib +export CUDNN_LIBRARY=/usr/local/cuda-$SMP_CUDA_VER/lib +export CUDNN_INCLUDE_DIR=/usr/local/cuda-$SMP_CUDA_VER/include +export PATH=/usr/local/cuda-$SMP_CUDA_VER/bin:$PATH +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-$SMP_CUDA_VER/lib + +pip install git+https://github.com/NVIDIA/TransformerEngine.git@v1.2.1 diff --git a/3.test_cases/17.SM-modelparallelv2/docker_build.sh b/3.test_cases/17.SM-modelparallelv2/docker_build.sh new file mode 100644 index 00000000..0fabb553 --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/docker_build.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +region=us-west-2 +dlc_account_id=658645717510 +aws ecr get-login-password --region $region | docker login --username AWS --password-stdin $dlc_account_id.dkr.ecr.$region.amazonaws.com + +docker build -t smpv2 . +enroot import -o smpv2.sqsh dockerd://smpv2:latest diff --git a/3.test_cases/17.SM-modelparallelv2/launch_training_conda.sh b/3.test_cases/17.SM-modelparallelv2/launch_training_conda.sh new file mode 100644 index 00000000..44ecf52c --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/launch_training_conda.sh @@ -0,0 +1,147 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +#SBATCH --nodes=8 # number of nodes to use, 2 p4d(e) = 16 A100 GPUs +#SBATCH --job-name=smpv2_llama # name of your job +#SBATCH --exclusive # job has exclusive use of the resource, no sharing +#SBATCH --wait-all-nodes=1 + +set -ex; + +########################### +###### User Variables ##### +########################### + +######################### +model_type=llama_v2 +model_size=70b + +#Toggle this to use synthetic data +use_synthetic_data=1 + + +# To run training on your own data set Training/Test Data path -> Change this to the tokenized dataset path in Fsx. Acceptable formats are huggingface (arrow) and Jsonlines. + +export TRAINING_DIR=/fsx/path_to_train_data +export TEST_DIR=/fsx/path_to_test_data +export CHECKPOINT_DIR=$(pwd)/checkpoints + +############ +export TORCHRUN=$(pwd)/miniconda3/envs/smpv2/bin/torchrun +export TRAIN_SCRIPT=$(pwd)/scripts/train_external.py + +############### +## Environment Variables ## +########################### + +#export NCCL_SOCKET_IFNAME=en +export NCCL_ASYNC_ERROR_HANDLING=1 + +export NCCL_PROTO="simple" +export NCCL_SOCKET_IFNAME="^lo,docker" +export RDMAV_FORK_SAFE=1 +export FI_EFA_USE_DEVICE_RDMA=1 +export NCCL_DEBUG_SUBSYS=off +export NCCL_DEBUG="INFO" +export SM_NUM_GPUS=8 +export GPU_NUM_DEVICES=8 +export FI_EFA_SET_CUDA_SYNC_MEMOPS=0 + + +# async runtime error ... 
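+# CUDA_DEVICE_MAX_CONNECTIONS controls how many hardware work queues the GPU exposes;
+# pinning it to 1 keeps kernel launch ordering strict and is a common workaround for the async runtime error noted above.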
+export CUDA_DEVICE_MAX_CONNECTIONS=1 + +######################### +## Command and Options ## + + + +if [ "$model_size" == "7b" ]; then + HIDDEN_WIDTH=4096 + NUM_LAYERS=32 + NUM_HEADS=32 + LLAMA_INTERMEDIATE_SIZE=11008 + DEFAULT_SHARD_DEGREE=8 +elif [ "$model_size" == "13b" ]; then + HIDDEN_WIDTH=5120 + NUM_LAYERS=40 + NUM_HEADS=40 + LLAMA_INTERMEDIATE_SIZE=13760 + # Reduce for better perf on p4de + DEFAULT_SHARD_DEGREE=64 +elif [ "$model_size" == "20b" ]; then + if [ "$model_type" == "llama_v2" ]; then + echo "Llama V2 is only configured for 7b, 13b and 70b, please add the configuration if you wish to run 20b" + exit 1 + fi + HIDDEN_WIDTH=6144 + NUM_LAYERS=44 + NUM_HEADS=64 + # Reduce for better perf on p4de + DEFAULT_SHARD_DEGREE=64 +elif [ "$model_size" == "65b" ]; then + if [ "$model_type" == "llama_v2" ]; then + echo "Llama V2 is only configured for 7b, 13b and 70b, please add the configuration if you wish to run 65b" + exit 1 + fi + HIDDEN_WIDTH=8192 + NUM_LAYERS=80 + NUM_HEADS=64 + # Reduce for better perf on p4de + DEFAULT_SHARD_DEGREE=128 +elif [ "$model_size" == "70b" ]; then + HIDDEN_WIDTH=8192 + NUM_LAYERS=80 + NUM_HEADS=64 + LLAMA_INTERMEDIATE_SIZE=28672 + # Reduce for better perf on p4de + DEFAULT_SHARD_DEGREE=64 +fi + + +if [ -z "$shard_degree" ]; then + SHARD_DEGREE=$DEFAULT_SHARD_DEGREE +else + SHARD_DEGREE=$shard_degree +fi + +if [ -z "$LLAMA_INTERMEDIATE_SIZE" ]; then + LLAMA_ARGS="" +else + LLAMA_ARGS="--llama_intermediate_size $LLAMA_INTERMEDIATE_SIZE " +fi + + + +declare -a TORCHRUN_ARGS=( + # change this to match the number of gpus per node: + --nproc_per_node=8 \ + --nnodes=$SLURM_JOB_NUM_NODES \ + --rdzv_id=$SLURM_JOB_ID \ + --rdzv_backend=c10d \ + --rdzv_endpoint=$(hostname) \ +) + + +srun -l ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT \ + --train_batch_size 4 \ + --max_steps 100 \ + --hidden_width $HIDDEN_WIDTH \ + --num_layers $NUM_LAYERS \ + --num_heads $NUM_HEADS \ + ${LLAMA_ARGS} \ + --shard_degree $SHARD_DEGREE \ + --model_type $model_type \ + --profile_nsys 1 \ + --use_smp_implementation 1 \ + --max_context_width 4096 \ + --tensor_parallel_degree 1 \ + --use_synthetic_data $use_synthetic_data \ + --training_dir $TRAINING_DIR \ + --test_dir $TEST_DIR \ + --dataset_type hf \ + --checkpoint_dir $CHECKPOINT_DIR \ + --checkpoint_freq 100 \ + diff --git a/3.test_cases/17.SM-modelparallelv2/launch_training_enroot.sh b/3.test_cases/17.SM-modelparallelv2/launch_training_enroot.sh new file mode 100644 index 00000000..92751b23 --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/launch_training_enroot.sh @@ -0,0 +1,165 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +#SBATCH --nodes=8 # number of nodes to use, 2 p4d(e) = 16 A100 GPUs +#SBATCH --job-name=smpv2_llama # name of your job +#SBATCH --exclusive # job has exclusive use of the resource, no sharing +#SBATCH --wait-all-nodes=1 + +set -ex; + +########################### +###### User Variables ##### +########################### + +######################### +model_type=llama_v2 +model_size=70b + +#Toggle this to use synthetic data +use_synthetic_data=1 + + +# To run training on your own data set Training/Test Data path -> Change this to the tokenized dataset path in Fsx. Acceptable formats are huggingface (arrow) and Jsonlines. 
+# Also change the use_synthetic_data to 0 + +export TRAINING_DIR=/fsx/path_to_data +export TEST_DIR=/fsx/path_to_data +export CHECKPOINT_DIR=$(pwd)/checkpoints + +# default variables for Enroot +: "${IMAGE:=$(pwd)/smpv2.sqsh}" +: "${HYPERPOD_PATH:="/var/log/aws/clusters":"/var/log/aws/clusters"}" #this is need for validating its hyperpod cluster +: "${TRAIN_DATA_PATH:=$TRAINING_DIR:$TRAINING_DIR}" +: "${TEST_DATA_PATH:=$TEST_DIR:$TEST_DIR}" +: "${CHECKPOINT_PATH:=$CHECKPOINT_DIR:$CHECKPOINT_DIR}" +############ + + +############### +## Environment Variables ## +########################### + +#export NCCL_SOCKET_IFNAME=en +export NCCL_ASYNC_ERROR_HANDLING=1 + +export NCCL_PROTO="simple" +export NCCL_SOCKET_IFNAME="^lo,docker" +export RDMAV_FORK_SAFE=1 +export FI_EFA_USE_DEVICE_RDMA=1 +export NCCL_DEBUG_SUBSYS=off +export NCCL_DEBUG="INFO" +export SM_NUM_GPUS=8 +export GPU_NUM_DEVICES=8 +export FI_EFA_SET_CUDA_SYNC_MEMOPS=0 + + +# async runtime error ... +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +######################### +## Command and Options ## + + + +if [ "$model_size" == "7b" ]; then + HIDDEN_WIDTH=4096 + NUM_LAYERS=32 + NUM_HEADS=32 + LLAMA_INTERMEDIATE_SIZE=11008 + DEFAULT_SHARD_DEGREE=8 +elif [ "$model_size" == "13b" ]; then + HIDDEN_WIDTH=5120 + NUM_LAYERS=40 + NUM_HEADS=40 + LLAMA_INTERMEDIATE_SIZE=13760 + # Reduce for better perf on p4de + DEFAULT_SHARD_DEGREE=64 +elif [ "$model_size" == "20b" ]; then + if [ "$model_type" == "llama_v2" ]; then + echo "Llama V2 is only configured for 7b, 13b and 70b, please add the configuration if you wish to run 20b" + exit 1 + fi + HIDDEN_WIDTH=6144 + NUM_LAYERS=44 + NUM_HEADS=64 + # Reduce for better perf on p4de + DEFAULT_SHARD_DEGREE=64 +elif [ "$model_size" == "65b" ]; then + if [ "$model_type" == "llama_v2" ]; then + echo "Llama V2 is only configured for 7b, 13b and 70b, please add the configuration if you wish to run 65b" + exit 1 + fi + HIDDEN_WIDTH=8192 + NUM_LAYERS=80 + NUM_HEADS=64 + # Reduce for better perf on p4de + DEFAULT_SHARD_DEGREE=128 +elif [ "$model_size" == "70b" ]; then + HIDDEN_WIDTH=8192 + NUM_LAYERS=80 + NUM_HEADS=64 + LLAMA_INTERMEDIATE_SIZE=28672 + # Reduce for better perf on p4de + DEFAULT_SHARD_DEGREE=64 +fi + + +if [ -z "$shard_degree" ]; then + SHARD_DEGREE=$DEFAULT_SHARD_DEGREE +else + SHARD_DEGREE=$shard_degree +fi + +if [ -z "$LLAMA_INTERMEDIATE_SIZE" ]; then + LLAMA_ARGS="" +else + LLAMA_ARGS="--llama_intermediate_size $LLAMA_INTERMEDIATE_SIZE " +fi + + +if [ $use_synthetic_data == 1 ]; then + echo "using synthetic data" + declare -a ARGS=( + --container-image $IMAGE + --container-mounts $HYPERPOD_PATH,$CHECKPOINT_PATH + ) +else + echo "using real data...." 
+ declare -a ARGS=( + --container-image $IMAGE + --container-mounts $HYPERPOD_PATH,$TRAIN_DATA_PATH,$TEST_DATA_PATH,$CHECKPOINT_PATH + ) +fi + + +declare -a TORCHRUN_ARGS=( + # change this to match the number of gpus per node: + --nproc_per_node=8 \ + --nnodes=$SLURM_JOB_NUM_NODES \ + --rdzv_id=$SLURM_JOB_ID \ + --rdzv_backend=c10d \ + --rdzv_endpoint=$(hostname) \ +) + +srun -l "${ARGS[@]}" torchrun "${TORCHRUN_ARGS[@]}" /workspace/train_external.py \ + --train_batch_size 4 \ + --max_steps 100 \ + --hidden_width $HIDDEN_WIDTH \ + --num_layers $NUM_LAYERS \ + --num_heads $NUM_HEADS \ + ${LLAMA_ARGS} \ + --shard_degree $SHARD_DEGREE \ + --model_type $model_type \ + --profile_nsys 1 \ + --use_smp_implementation 1 \ + --max_context_width 4096 \ + --tensor_parallel_degree 1 \ + --use_synthetic_data $use_synthetic_data \ + --training_dir $TRAINING_DIR \ + --test_dir $TEST_DIR \ + --dataset_type hf \ + --checkpoint_dir $CHECKPOINT_DIR \ + --checkpoint_freq 100 \ \ No newline at end of file diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/arguments.py b/3.test_cases/17.SM-modelparallelv2/scripts/arguments.py new file mode 100644 index 00000000..2d5c04d9 --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/arguments.py @@ -0,0 +1,349 @@ +"""FSDP binary script arguments.""" + +import argparse +import os + + +def parse_args(): # pylint: disable=too-many-statements + """Parse args.""" + parser = argparse.ArgumentParser() + + # hyperparameters sent by the client are passed as command-line arguments to the script. + + ### OPTIMIZATION + opt_grp = parser.add_argument_group( + title="optimization", description="arguments for optimization" + ) + opt_grp.add_argument( + "--train_batch_size", + type=int, + default=2, + help="batch size per dp rank, for tensor parallelism degree 8 with pipeline parallel degree 1 this means 8*this batch size per node", # pylint: disable=line-too-long + ) + opt_grp.add_argument("--max_steps", "--max_training_steps", type=int, default=5000) + opt_grp.add_argument( + "--epochs", type=int, default=3, help="times of iterating over the training dataset" + ) + opt_grp.add_argument("--seed", type=int, default=12345) + opt_grp.add_argument("--same_seed", type=int, default=0) + opt_grp.add_argument("--bf16", default=1, type=int, help="automatic mixed precision training") + opt_grp.add_argument("--fp8", default=1, type=int, help="fp8 mixed precision training") + opt_grp.add_argument("--fp8_amax_history_len", default=1024, type=int, help="amax history length") + opt_grp.add_argument("--fp8_amax_compute_algo", default="max", type=str, help="amax computation algorithm: 'max' or 'most_recent'") + opt_grp.add_argument("--grad_clip", default=1.0, type=float, help="gradient clipping") + opt_grp.add_argument("--weight_decay", default=0.2, type=float, help="weight decay") + opt_grp.add_argument( + "--beta1", default=0.9, type=float, help="beta1 parameter for Adam optimizer" + ) + opt_grp.add_argument( + "--beta2", default=0.95, type=float, help="beta2 parameter for Adam optimizer" + ) + + # Learning rate + lr_grp = parser.add_argument_group( + title="lr", description="arguments for learning rate schedule" + ) + lr_grp.add_argument("--lr", type=float, default=0.0001, help="Initial learning rate.") + lr_grp.add_argument( + "--lr_decay_style", + type=str, + default="cosine", + choices=["constant", "linear", "cosine", "exponential", "plateau"], + help="Learning rate decay function.", + ) + lr_grp.add_argument( + "--lr_decay_iters", + type=int, + default=47683, + help="number of 
iterations to decay learning rate over," " If None defaults to train iters", + ) + lr_grp.add_argument( + "--min_lr", + type=float, + default=1e-05, + help="Minumum value for learning rate. The scheduler" "clip values below this threshold.", + ) + lr_grp.add_argument( + "--warmup", + type=float, + default=0.0032, + help="Percentage of total iterations to warmup on " + "(.01 = 1 percent of all training iters).", + ) + lr_grp.add_argument( + "--plateau", + type=float, + default=0.0, + help="Percentage of total iterations to keep at max if using plateau lr", + ) + + ### MEMORY USAGE RELATED + mem_grp = parser.add_argument_group(title="memory usage", description="arguments for memory") + mem_grp.add_argument( + "--activation_checkpointing", + type=int, + default=1, + help="enable gradient checkpointing to reduce memory consumption", + ) + mem_grp.add_argument("--offload_activations", type=int, default=0) # REMOVE_IN_PUBLIC_NOTEBOOK + mem_grp.add_argument("--activation_loading_horizon", type=int, default=2) # REMOVE_IN_PUBLIC_NOTEBOOK + mem_grp.add_argument("--patch_neox_rope", type=int, default=1) + mem_grp.add_argument("--delayed_param", type=int, default=1) + mem_grp.add_argument( + "--enable_memory_profiling", type=int, default=0, help="Enable memory profile" + ) + mem_grp.add_argument( + "--clean_cache", + type=int, + default=0, + help="Clean torch reserved memory at he end of every step", + ) + + ### LOGGING + logging_grp = parser.add_argument_group( + title="logging", description="arguments for logging metrics" + ) + logging_grp.add_argument( + "--logging_freq", type=int, default=1, help="number of iterations between logging" + ) + logging_grp.add_argument( + "--logging_freq_for_avg", + type=int, + default=50, + help="number of iterations between logging the running avg", + ) + logging_grp.add_argument( + "--log_reduced_training_loss", + type=int, + default=0, + help="to log training loss after reducing across all data parallel ranks with logging_freq frequency", # pylint: disable=line-too-long + ) + logging_grp.add_argument("--tensorboard_dir", type=str, nargs="+", default=None) + + ### CHECKPOINTS + ckpt_grp = parser.add_argument_group(title="checkpoints", description="checkpointing arguments") + ckpt_grp.add_argument( + "--num_kept_checkpoints", + nargs="+", + type=int, + default=[2], + help="how many checkpoints to keep before deleting", + ) + ckpt_grp.add_argument( + "--checkpoint_freq", + nargs="+", + type=int, + default=[1000], + help="number of iterations between checkpointing", + ) + ckpt_grp.add_argument( + "--checkpoint_dir", + nargs="+", + type=str, + default=["/opt/ml/checkpoints"], + help="Saves partial checkpoints (model, optimizer) to this dir, and loads latest checkpoint from this if load_partial is specified.", # pylint: disable=line-too-long + ) + ckpt_grp.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="Checkpoint folder name to load from", + ) + ckpt_grp.add_argument( + "--checkpoint_type", type=str, default="sharded", choices=["local", "sharded", "use_pg_with_util"] + ) + ckpt_grp.add_argument( + "--model_dir", + type=str, + default=None, + help="If not passed, saves it to checkpoint_dir/model. 
Only saved when save_final_model is 1", + ) + ckpt_grp.add_argument("--save_final_model", type=int, default=0) + + ### I/O + input_grp = parser.add_argument_group(title="inputs", description="location for data") + + input_grp.add_argument( + "--dataset_type", type=str, default="gpt_jsonl", choices=["gpt_jsonl", "hf"] + ) + input_grp.add_argument("--data_num_workers", type=int, default=0) + + input_grp.add_argument("--data_type", type=str.lower, default="gpt", choices=["gpt", "bert"]) + # dummy dataset + input_grp.add_argument("--use_synthetic_data", type=int, default=0) + + # gpt dataset + input_grp.add_argument("--zipped_data", type=int, default=1, help="input data is zipped files") + input_grp.add_argument("--training_dir", type=str, default=os.getenv("SM_CHANNEL_TRAIN")) + input_grp.add_argument("--test_dir", type=str, default=os.getenv("SM_CHANNEL_TEST")) + + ### MODEL + model_grp = parser.add_argument_group( + title="model", description="arguments to describe model configuration" + ) + model_grp.add_argument( + "--hf_pretrained_model_name_or_dir", + type=str, + default=None, + help=( + "For finetuning, pass the pretrained Huggingface model name or path where the model is downloaded. " + "Example: EleutherAI/gpt-neox-20b. or /path/to/downloaded/model. " + "This flag is used for loading both config and weights. " + "When this config is used, flags such as vocab_size, hidden_width etc are ignored in creating the model. " + "For finetuning you need to set this flag even when resuming from a checkpoint. " + ), + ) + model_grp.add_argument("--max_context_width", type=int, default=2048) + model_grp.add_argument("--vocab_size", type=int, default=50432) + model_grp.add_argument("--hidden_width", type=int, default=768) + model_grp.add_argument("--num_layers", type=int, default=12) + model_grp.add_argument("--num_heads", type=int, default=12) + model_grp.add_argument("--resid_pdrop", type=float, default=0.1) + model_grp.add_argument("--embd_pdrop", type=float, default=0.1) + model_grp.add_argument("--attn_pdrop", type=float, default=0.1) + model_grp.add_argument("--summary_first_pdrop", type=float, default=0.1) + model_grp.add_argument("--initializer_range", type=float, default=0.02) + model_grp.add_argument( + "--model_type", type=str, default="gpt_neox", choices=["gpt_neox", "llama_v2", "gpt2", "mistral", "mixtral"] + ) + model_grp.add_argument("--rotary_pct", type=float, default=0.25) + model_grp.add_argument("--rotary_emb_base", type=int, default=10000) + model_grp.add_argument("--use_smp_flash_attn", type=int, default=1) + model_grp.add_argument( + "--llama_intermediate_size", + type=int, + default=11008, + help="intermediate_size for Llama v2, a dimension associated with MLP", + ) + model_grp.add_argument( + "--intermediate_size", + type=int, + default=14336, + help="A specified intermediate_size, a dimension associated with MLP", + ) + model_grp.add_argument( + "--sliding_window", + type=int, + default=None, + help="Sliding window attention window size", + ) + model_grp.add_argument( + "--num_key_value_heads", + type=int, + default=None, + help="The number of heads for key and value in GQA", + ) + model_grp.add_argument( + "--num_experts_per_tok", + type=int, + default=2, + help="The number of experts to root per-token", + ) + model_grp.add_argument( + "--num_local_experts", + type=int, + default=8, + help="Number of experts per Sparse MLP layer", + ) + model_grp.add_argument( + "--use_smp_implementation", + type=int, + default=0, + help="Whether to use SMP optimized implementation of 
model. " + "All models may not be supported." + "When using tensor_parallel_degree, this is automatically enabled.", + ) + model_grp.add_argument("--tensor_parallel_degree", type=int, default=1, help="Whether to enable tensor parallelism. If degree > 1, then --use_smp_implementation is assumed to be 1.") # REMOVE_IN_PUBLIC_NOTEBOOK + + ### FSDP args + fsdp_grp = parser.add_argument_group( + title="fsdp", description="arguments for fully sharded data parallel" + ) + fsdp_grp.add_argument("--limit_all_gathers", default=1, type=int) + fsdp_grp.add_argument("--forward_prefetch", default=1, type=int) + fsdp_grp.add_argument( + "--sharding_strategy", + type=str, + default="hybrid_shard", + help="options: no_shard, shard_grad_op, hybrid_shard, _hybrid_shard_zero2, full_shard", + ) + fsdp_grp.add_argument( + "--use_orig_params", + default=0, + type=int, + help="This flag needs to be set when you need multiple param groups for optimizer, such as for weight decay", + ) + # Note that `shard_degree` might rewrite `sharding_strategy`: + # + # 1. When there is no explicit `shard_degree` or `0`, will fall back to native PyTorch, for all + # `sharding_strategy` cases. + # + # 2. When there is explicit `shard_degree` and it's in `[1, world_size]`: + # - Will rewrite `sharding_strategy` to `HYBRID_SHARD`, when and only when it's not either of + # the two native hybrid strategies, i.e. `{HYBRID_SHARD, _HYBRID_SHARD_ZERO2}`. + # + # - Will use hybrid sharding implementation by SageMaker: + # - 1: Should be equivalent to native PyTorch's `NO_SHARD`. + # - Might have some issues when exporting checkpoints to the disk in native PyTorch. + # - 8: Should be equivalent to native PyTorch's `HYBRID_SHARD`. + # - $world_size: Should be equivalent to native PyTorch's `FULL_SHARD`, though throughput + # might be worse with unnecessary communications. + # - Other values e.g. 2, 4, 16, etc, as long as $world_size is divisible by them: + # - Newly supported sharding implementation by SageMaker. 
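+    # For example, with 8 GPUs per node, an explicit shard_degree of 8 typically shards parameters
+    # within each node and replicates them across nodes (hybrid sharding), while shard_degree equal
+    # to the world size behaves like full sharding across the whole cluster.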
+ fsdp_grp.add_argument("--shard_degree", type=int, default=None, nargs="?", help="Sharding degree for partial shard strategy") # REMOVE_IN_PUBLIC_NOTEBOOK + fsdp_grp.add_argument( + "--backward_fetch_policy", + type=str, + default="backward_pre", + help="options: backward_post, backward_pre", + ) + fsdp_grp.add_argument( + "--auto_wrap_policy", + type=str, + default="transformer_auto_wrap_policy", + help="options: size_based_auto_wrap_policy, transformer_auto_wrap_policy", + ) + + ### VALIDATION + validation_grp = parser.add_argument_group( + title="validation", description="arguments for validation" + ) + validation_grp.add_argument( + "--validation_freq", + type=int, + default=None, + help="number of iterations to print validation loss", + ) + validation_grp.add_argument( + "--validation_batches", + type=int, + default=10, + help="number of batches to estimate validation loss", + ) + validation_grp.add_argument( + "--preserve_np_state", + type=int, + default=0, + help="Perserve the numpy random state between validation", + ) + validation_grp.add_argument( + "--fast_validation", + type=int, + default=1, + help="Running validation only with the last data file for faster speed", + ) + validation_grp.add_argument("--val_batch_size", type=int, default=4) + + ### OTHERS + parser.add_argument( + "--distributed_backend", + type=str, + default="nccl", + choices=["smddp", "nccl"], + help="Distributed backend to use for collectives", + ) + parser.add_argument("--nccl_test_log", type=str, default="") + parser.add_argument("--profile_nsys", type=int, default=0) + parser.add_argument("--framework", type=str, default="fsdp") + + return parser.parse_known_args() diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/checkpoints.py b/3.test_cases/17.SM-modelparallelv2/scripts/checkpoints.py new file mode 100644 index 00000000..cdc205df --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/checkpoints.py @@ -0,0 +1,535 @@ +"""Export distributed checkpoints.""" + +import os +import pickle +import statistics +import time +import warnings +from enum import Enum, auto +from typing import Any, Dict, Optional + +import numpy + +# pylint: disable=import-error,no-name-in-module +import torch +import torch.distributed as dist +import torch.sagemaker.checkpoint.utils as tsm_checkpoint +from data.utils import is_s3_source, parse_s3_address +from logging_utils import get_logger +from torch.distributed import checkpoint +from torch.distributed._shard.api import load_with_process_group +from torch.distributed.checkpoint.optimizer import load_sharded_optimizer_state_dict +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp import StateDictType +from torch.distributed.fsdp.api import FullStateDictConfig, ShardedOptimStateDictConfig +from torch.sagemaker.distributed.fsdp import checkpoint as tsm_fsdp_checkpoint +from torch.sagemaker.utils.process_group_utils import get_global_ranks + +logger = get_logger() + + +# How to remove extra checkpoints, `regex` and `sort_fn` need to match for correctness. +# - Sort subdir by the **last** int, right before `steps` as shown in the regex. 
+_CHECKPOINT_DIR_REGEX = r"^.*\d+steps$" +_CHECKPOINT_SORT_FN = tsm_checkpoint.SORT_BY_LAST_INT +_DEFAULT_STATE_DICT_TYPE = StateDictType.SHARDED_STATE_DICT + +_EXPORT_KEYS = ( + "resume_from_sequence_number", + "start_train_path_index", + "total_steps", +) + +_MAX_ATTEMPTS = 3 + + +class CheckpointingMethod(Enum): + SHARDED = auto() + LOCAL = auto() + FULL = auto() + USE_PG_WITH_UTIL = auto() + + +def backward_compat_get_resume_from_sequence_number(args, state_dict): + if "resume_from_sequence_number" not in state_dict: + return state_dict["start_batch_index"] * args.train_batch_size + else: + return state_dict["resume_from_sequence_number"] + + +def compute_stats_of_metric(metric: float, key: str, group: Optional[Any] = None): + """Compute metric stats.""" + times = [None for _ in range(dist.get_world_size(group))] + dist.all_gather_object(times, metric, group=group) + + if dist.get_rank() == 0: + logger.info( + "Time taken (min, max, mean, stddev, median, len) = " + "(%7.2f, %7.2f, %7.2f, %7.2f, %7.2f, %02d): %s.", + numpy.min(times), + numpy.max(times), + statistics.mean(times), + statistics.stdev(times), + statistics.median(times), + len(times), + key, + ) + + +def is_action_rank(global_rank): + from torch.sagemaker import state + + return state.ranker.get_rep_rank(global_rank) == 0 + + +def get_coordinator_rank(process_group): + model_pg_ranks = get_global_ranks(process_group) + return min(model_pg_ranks) + + +def _retry_write_to_disk(func, max_attempts=_MAX_ATTEMPTS): + for retry in range(max_attempts): + try: + func() + return + except (RuntimeError, pickle.UnpicklingError) as error: + if isinstance(error, pickle.UnpicklingError) or ("unexpected pos" in str(error)): + # TODO(sliuxl): Sometimes writes to fsx fail, not sure why yet, retry for now. + logger.error(error) + logger.error( + "Retry [%d/%d] failed to write to disk, in case it was due to transient error.", + retry, + max_attempts, + ) + if retry < max_attempts - 1: + continue + + raise error + + +def _save_with_util( # pylint: disable=too-many-arguments + model, + optimizer, + scheduler, + user_content, + sharding_strategy, + save_dir: str, + checkpointing_pg_metadata, +): + """Save FSDP checkpoint: With process groups.""" + # By default, it'll use process groups when exporting checkpoints. + tsm_fsdp_checkpoint.save_model_checkpoint( + model, + _DEFAULT_STATE_DICT_TYPE, + save_dir, + sharding_strategy, + checkpointing_pg_metadata, + log=dist.get_rank() == 0, + optimizer=optimizer, + scheduler=scheduler, + extra_exports=( + {key: user_content[key] for key in _EXPORT_KEYS} if user_content is not None else None + ), + ) + + +def _save_sharded( # pylint: disable=too-many-arguments + model, + optimizer, + scheduler, + user_content, + save_dir: str, + checkpointing_pg_metadata, +): + """Save FSDP checkpoint: Without process groups.""" + with FSDP.state_dict_type(model, _DEFAULT_STATE_DICT_TYPE): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + # pylint: disable=line-too-long + # torch/distributed/fsdp/_common_utils.py:291: UserWarning: + # An unexpected prefix is detected. This case should only happen when using DMP with FSDP. 
+ # prefix = _checkpoint_wrapped_module.gpt_neox.layers.34., submodule_name = _fsdp_wrapped_module + # pylint: enable=line-too-long + # TODO(rubik) Not sure why this shows up + + optim_state_dict = FSDP.optim_state_dict(model, optimizer) + + state_dict = { + "model": model.state_dict(), + "optimizer": optim_state_dict, + "scheduler": scheduler.state_dict(), + } + # merge user content to state_dict + state_dict = state_dict | user_content + + if dist.get_rank() == 0: + logger.info("Processed state dict to save. Starting write to disk now.") + + process_group, coordinator_rank, action_rank = checkpointing_pg_metadata + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + # torch/distributed/checkpoint/filesystem.py:157: UserWarning: TypedStorage is deprecated. + + if action_rank: + checkpoint.save_state_dict( + state_dict=state_dict, + storage_writer=checkpoint.FileSystemWriter(save_dir), + planner=checkpoint.DefaultSavePlanner(), + process_group=process_group, + coordinator_rank=coordinator_rank, + ) + + +def _save_full( # pylint: disable=too-many-arguments + model, + save_dir: str, + user_content: Dict, +): + """Save FSDP checkpoint: Without process groups.""" + if dist.get_rank() == 0: + logger.warning("Full checkpoint only saves the model") + + with FSDP.state_dict_type( + model, + StateDictType.FULL_STATE_DICT, + FullStateDictConfig(rank0_only=True, offload_to_cpu=True), + ): + state_dict = model.state_dict() + if dist.get_rank() == 0: + logger.info("Processed state dict to save. Starting write to disk now.") + os.makedirs(save_dir, exist_ok=True) + # this name is needed for HF from_pretrained API to work fine + torch.save(state_dict, os.path.join(save_dir, "pytorch_model.bin")) + user_content["model_config"].save_pretrained(save_dir) + dist.barrier() + + +def _save_local( # pylint: disable=too-many-arguments + model, + optimizer, + scheduler, + user_content, + save_dir: str, +): + """Save FSDP checkpoint: Without process groups.""" + os.makedirs(save_dir, exist_ok=True) + with FSDP.state_dict_type(model, StateDictType.LOCAL_STATE_DICT): + optim_state_dict = optimizer.state_dict() + + state_dict = { + "model": model.state_dict(), + "optimizer": optim_state_dict, + "scheduler": scheduler.state_dict(), + } + # merge user content to state_dict + state_dict = state_dict | user_content + + if dist.get_rank() == 0: + logger.info("Processed state dict to save. Starting write to disk now.") + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + # torch/distributed/checkpoint/filesystem.py:157: UserWarning: TypedStorage is deprecated. 
+ def write_fn(): + torch.save(state_dict, os.path.join(save_dir, f"{dist.get_rank()}.pt")) + + _retry_write_to_disk(write_fn) + + +def save_checkpoint( # pylint: disable=too-many-arguments,too-many-locals + model, + optimizer, + scheduler, + user_content, + sharding_strategy, + root_dir: str, + subdir: str, + num_kept_checkpoints: int, + checkpointing_pg_metadata, + tensor_parallel_degree: int, + checkpoint_type=CheckpointingMethod.LOCAL, +): + """Export checkpoint.""" + from torch.sagemaker import state + + # seeing a NCCL crash during broadcast in checkpointing sometimes + # seems like that happens when cached memory usage is at the limit + # so clearing cache + torch.cuda.empty_cache() + + if not root_dir: + return + + save_dir = os.path.join(root_dir, subdir) + if is_s3_source(root_dir): + save_dir = os.path.join(f"/tmp/checkpoint_{dist.get_rank()}", subdir) + + if dist.get_rank() == 0: + logger.info("Checkpointing to %s ...", save_dir) + + if isinstance(checkpoint_type, str): + checkpoint_type = CheckpointingMethod[checkpoint_type.upper()] + + ckpt_start = time.process_time() + if checkpoint_type == CheckpointingMethod.SHARDED: + if tensor_parallel_degree > 1: + save_dir = os.path.join(save_dir, f"tp{tensor_parallel_degree}-{state.tp_rank}") + _save_sharded( + model, optimizer, scheduler, user_content, save_dir, checkpointing_pg_metadata + ) + elif checkpoint_type == CheckpointingMethod.LOCAL: + if tensor_parallel_degree > 1: + raise NotImplementedError("Local checkpointing unsupported with tensor parallelism") + _save_local(model, optimizer, scheduler, user_content, save_dir) + elif checkpoint_type == CheckpointingMethod.FULL: + _save_full(model, save_dir, user_content) + elif checkpoint_type == CheckpointingMethod.USE_PG_WITH_UTIL: + _save_with_util( + model, + optimizer, + scheduler, + user_content, + sharding_strategy, + save_dir, + checkpointing_pg_metadata, + ) + ckpt_time = time.process_time() - ckpt_start + dist.barrier() + + process_group = None if checkpointing_pg_metadata is None else checkpointing_pg_metadata[0] + compute_stats_of_metric(ckpt_time, "saving checkpoint (s)", process_group) + + if dist.get_rank() == 0: + logger.info("Finished checkpointing to %s.", save_dir) + + if is_s3_source(root_dir): + s3_start = time.process_time() + + bucket, bucketdir = parse_s3_address(root_dir) + bucketdir = os.path.join(bucketdir, subdir) + import boto3 + + s3_client = boto3.client("s3") + for fname in os.listdir(save_dir): + fpath = os.path.join(save_dir, fname) + bucketobj = os.path.join(bucketdir, fname) + s3_client.upload_file(fpath, bucket, bucketobj) + + s3_time = time.process_time() - s3_start + logger.info("Rank %d: saved to %s in %f sec", dist.get_rank(), bucketdir, s3_time) + dist.barrier() + + # Only limit subdirs when writing intermediate checkpoints, not the final checkpoint. + if not subdir: + return + + # Limit checkpoints after writing the latest one. + tsm_checkpoint.limit_num_subdirs( + # Need to access the **full** path. + os.path.abspath(root_dir), + num_kept_checkpoints, + sort_fn=_CHECKPOINT_SORT_FN, + regex=_CHECKPOINT_DIR_REGEX, + # Both log messages and do the actual remove as needed for one single rank. + log=dist.get_rank() == 0, + ) + + +# pylint: disable=too-many-arguments,too-many-locals +def _load_with_util( + model, + optimizer, + scheduler, + checkpoint_dir, + sharding_strategy, + checkpointing_pg_metadata, +): + """Load FSDP checkpoint: With process groups.""" + # By default, it'll use process groups when exporting checkpoints. 
+ return tsm_fsdp_checkpoint.load_model_checkpoint( + model, + _DEFAULT_STATE_DICT_TYPE, + checkpoint_dir, + sharding_strategy, + checkpointing_pg_metadata, + log=dist.get_rank() == 0, + optimizer=optimizer, + scheduler=scheduler, + extra_imports={key: 0 for key in _EXPORT_KEYS}, + ) + + +def _load_sharded(model, optimizer, scheduler, checkpoint_dir, checkpointing_pg_metadata): + process_group, coordinator_rank, _ = checkpointing_pg_metadata + with FSDP.state_dict_type( + model, + _DEFAULT_STATE_DICT_TYPE, + optim_state_dict_config=ShardedOptimStateDictConfig(offload_to_cpu=True), + ): + state_dict = { + "model": model.state_dict(), + "scheduler": scheduler.state_dict(), + "epoch": 0, + "total_steps": 0, + "start_train_path_index": 0, + "resume_from_sequence_number": 0, + # cannot load the optimizer state_dict together with the model state_dict + } + + def _load_from_disk(): + # NOTE: `_{save, load}_sharded` need to be consistent using the `process_group`s. + checkpoint.load_state_dict( + state_dict=state_dict, + storage_reader=checkpoint.FileSystemReader(checkpoint_dir), + process_group=process_group, + coordinator_rank=coordinator_rank, + planner=checkpoint.DefaultLoadPlanner(), + ) + + try: + _load_from_disk() + except KeyError(): + # when loading old checkpoints which had start_batch_index instead of resume_from_sequence_number + # replace the key in dummy state_dict, and retry + del state_dict["resume_from_sequence_number"] + state_dict["start_batch_index"] = 0 + _load_from_disk() + + if dist.get_rank() == 0: + logger.info("Loaded model state from disk") + + model.load_state_dict(state_dict["model"]) + scheduler.load_state_dict(state_dict["scheduler"]) + optim_state = load_sharded_optimizer_state_dict( + model_state_dict=state_dict["model"], + optimizer_key="optimizer", + storage_reader=checkpoint.FileSystemReader(checkpoint_dir), + process_group=model.process_group, + ) + + if dist.get_rank() == 0: + logger.info("Loaded and sharded optimizer state from disk") + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + # UserWarning to replace all_gather_base with all_gather_into_tensor floods the logs + flattened_osd = FSDP.optim_state_dict_to_load( + model=model, optim=optimizer, optim_state_dict=optim_state["optimizer"], + ) + + if dist.get_rank() == 0: + logger.info("Converted optimizer state dict for FSDP") + + optimizer.load_state_dict(flattened_osd) + + return state_dict + + +def gather_and_log_param_buffer_norms(model): + with FSDP.state_dict_type( + model, + StateDictType.FULL_STATE_DICT, + state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=True), + ): + sd = model.state_dict() + for k, v in sd.items(): + if dist.get_rank() == 0: + print(k, torch.linalg.norm(v), v.sum()) + for n, m in model.named_buffers(): + if dist.get_rank() == 0: + print(dist.get_rank(), n, torch.linalg.norm(m), m.sum()) + + +def _load_local(model, optimizer, scheduler, checkpoint_dir): + with load_with_process_group(model.process_group): + state_dict = torch.load(os.path.join(checkpoint_dir, f"{dist.get_rank()}.pt")) + + with FSDP.state_dict_type(model, StateDictType.LOCAL_STATE_DICT): + if dist.get_rank() == 0: + logger.info("Loaded model state from disk") + + model.load_state_dict(state_dict["model"]) + scheduler.load_state_dict(state_dict["scheduler"]) + optimizer.load_state_dict(state_dict["optimizer"]) + + return state_dict + + +def load_checkpoint( + args, + model, + optimizer, + scheduler, + checkpoint_dir: str, + sharding_strategy, + 
checkpointing_pg_metadata, + tensor_parallel_degree: int, + checkpoint_type=CheckpointingMethod.LOCAL, +): + """Load checkpoint.""" + from torch.sagemaker import state + + if dist.get_rank() == 0: + logger.info("Loading checkpoint from %s ...", checkpoint_dir) + + load_start = time.process_time() + if isinstance(checkpoint_type, str): + checkpoint_type = CheckpointingMethod[checkpoint_type.upper()] + + if checkpoint_type == CheckpointingMethod.USE_PG_WITH_UTIL: + loaded = _load_with_util( + model, + optimizer, + scheduler, + checkpoint_dir, + sharding_strategy, + checkpointing_pg_metadata, + ) + elif checkpoint_type == CheckpointingMethod.SHARDED: + if tensor_parallel_degree > 1: + checkpoint_dir = os.path.join( + checkpoint_dir, f"tp{tensor_parallel_degree}-{state.tp_rank}" + ) + loaded = _load_sharded( + model, optimizer, scheduler, checkpoint_dir, checkpointing_pg_metadata + ) + elif checkpoint_type == CheckpointingMethod.LOCAL: + if tensor_parallel_degree > 1: + raise NotImplementedError("Local checkpointing unsupported with tensor parallelism") + loaded = _load_local(model, optimizer, scheduler, checkpoint_dir) + else: + raise NotImplementedError + + load_time = time.process_time() - load_start + dist.barrier() + compute_stats_of_metric(load_time, "loading checkpoint (s)") + + if dist.get_rank() == 0: + logger.info("Checkpoint loaded from %s.", checkpoint_dir) + + if checkpoint_type == CheckpointingMethod.USE_PG_WITH_UTIL: + model = loaded[tsm_fsdp_checkpoint.EXPORT_KEY_MODEL] + optimizer = loaded[tsm_fsdp_checkpoint.EXPORT_KEY_OPTIMIZER] + scheduler = loaded[tsm_fsdp_checkpoint.EXPORT_KEY_SCHEDULER] + state_dict = loaded[tsm_fsdp_checkpoint.EXPORT_KEY_IDENTITY] + else: + state_dict = loaded + + resume_from_sequence_number = backward_compat_get_resume_from_sequence_number(args, state_dict) + if dist.get_rank() == 0: + logger.info( + "Loaded state from disk: epoch %d, start_train_path_index %d, resume_from_sequence_number %d.", + state_dict["epoch"], + state_dict["start_train_path_index"], + resume_from_sequence_number, + ) + + return ( + model, + optimizer, + scheduler, + state_dict["epoch"], + state_dict["total_steps"], + state_dict["start_train_path_index"], + resume_from_sequence_number, + ) diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/data/README.md b/3.test_cases/17.SM-modelparallelv2/scripts/data/README.md new file mode 100644 index 00000000..27fd83e7 --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/data/README.md @@ -0,0 +1,29 @@ +## Installation +### When using HF dataset + +``` +pip install datasets +``` +### When using nemo megatron dataset + +``` +conda install torchvision torchaudio --override-channels -c pytorch -c conda-forge +pip install Cython +pip install nemo_toolkit['all'] +``` + +## Preparation of datasets +``` +sbatch prep/prep_hf_dataset.slurm +``` +or +``` +sbatch prep/prep_nmt_dataset.slurm +``` + +## Using prepared datasets +1. Using HF dataset: +You will need to pass at least `--dataset_type hf` and `--training_dir` and `--test_dir` args. + +2. Using NMT dataset: +Currently there's a limitation in NMT to only use upto 255 files. That said, refer to the args for `# megatron dataset` in arguments.py. 
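+
+As a sketch (the paths below are placeholders, not part of the example), a prepared HF dataset is consumed by pointing the launch script at the tokenized output directories and switching off synthetic data:
+
+```
+# in launch_training_*.sh (illustrative paths)
+export TRAINING_DIR=/fsx/datasets/my_corpus/train
+export TEST_DIR=/fsx/datasets/my_corpus/val
+use_synthetic_data=0
+# the training script then receives --dataset_type hf --training_dir $TRAINING_DIR --test_dir $TEST_DIR
+```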
diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/data/__init__.py b/3.test_cases/17.SM-modelparallelv2/scripts/data/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/data/dataset/__init__.py b/3.test_cases/17.SM-modelparallelv2/scripts/data/dataset/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/data/dataset/dummy_dataset.py b/3.test_cases/17.SM-modelparallelv2/scripts/data/dataset/dummy_dataset.py new file mode 100644 index 00000000..92f1fc6d --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/data/dataset/dummy_dataset.py @@ -0,0 +1,21 @@ +import torch + + +class DummyDataset(torch.utils.data.dataset.Dataset): + """Dummy Dataset.""" + + def __init__(self, vocabulary_size=1024, seqlen=2048, length=100000, data_type="gpt"): + self.vocabulary_size = vocabulary_size + self.seqlen = seqlen + if data_type == "gpt": + self.mask = torch.ones((seqlen,)) + elif data_type == "bert": + raise NotImplementedError + self.length = length + self.input_paths = None + + def __getitem__(self, index): + return torch.randint(self.vocabulary_size, (self.seqlen,), dtype=torch.long), self.mask + + def __len__(self): + return self.length diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/data/dataset/gpt_dataset.py b/3.test_cases/17.SM-modelparallelv2/scripts/data/dataset/gpt_dataset.py new file mode 100644 index 00000000..69c1d96d --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/data/dataset/gpt_dataset.py @@ -0,0 +1,76 @@ +"""Data pipeline.""" +import gzip +import json +from io import BytesIO +from typing import List, Tuple, TypeVar + +import numpy as np +import torch +import torch.distributed as dist +from logging_utils import get_logger + +logger = get_logger() +T_co = TypeVar("T_co", covariant=True) + + +def chunks(l, n): + """Yield n number of striped chunks from l.""" + for i in range(0, n): + yield l[i::n] + + +###### Load GPT pretraining data ###### +class GPTPretrainingDataset(torch.utils.data.Dataset): + """GPT Pretraining Dataset.""" + + def __init__( + self, + input_paths: List[str], + max_sequence_length=None, + zipped=True, + ): + self.input_paths = input_paths + self.max_sequence_length = max_sequence_length + self.zipped = zipped + self.drop_last = True + self.input_data = [] + self.num_replicas = dist.get_world_size() if dist.is_initialized() else 1 + self.rank = dist.get_rank() if dist.is_initialized() else 0 + self.__read_examples(self.input_paths) + + def __read_examples(self, paths: List[str]): + for path in paths: + self.input_data = [] + # 1 below: each item of an S3Dataset object is a pair + # The 0th element is a string for S3 object address + # The 1st element is binary data + if isinstance(path, tuple): + filepath = path[0] + fileobj = BytesIO(path[1]) + else: + fileobj = path + + if self.zipped: + with gzip.open(fileobj, "rt") as f: + self.input_data = [ln for _, ln in enumerate(f, 1)] + else: + with open(fileobj, "r") as f: + self.input_data = [ln for ln in f] + if dist.get_rank() == 0: + logger.debug(f"Read {len(self.input_data)} sequences from file") + + def __len__(self) -> int: + return len(self.input_data) + + def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]: + obj = json.loads(self.input_data[index]) + iids = torch.tensor(obj["input_ids"], dtype=torch.long) + attns = torch.tensor(obj["attention_mask"], dtype=torch.long) + self.actual_sequence_length = len(obj["input_ids"]) + + if 
self.actual_sequence_length > self.max_sequence_length: + s_idx = np.random.randint(0, self.actual_sequence_length - self.max_sequence_length) + e_idx = s_idx + self.max_sequence_length + iids = iids[s_idx:e_idx] + attns = attns[s_idx:e_idx] + return iids, attns diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/__init__.py b/3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/__init__.py new file mode 100644 index 00000000..63ad41ba --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/__init__.py @@ -0,0 +1,44 @@ +from data.pipelines.data_pipeline import DataPipeline +from data.pipelines.dummy_data_pipeline import DummyDataPipeline +from data.pipelines.gpt_data_pipeline import GPTDataPipeline +from data.pipelines.hf_data_pipeline import HFDataPipeline + + +def create_data_pipeline( + args, start_train_path_index, resume_from_sequence_number, dp_rank, dp_size +): + if args.use_synthetic_data: + data_pipeline = DummyDataPipeline( + vocabulary_size=args.vocab_size, + train_batch_size=args.train_batch_size, + sequence_length=args.max_context_width, + ) + elif args.dataset_type == "gpt_jsonl": + data_pipeline = GPTDataPipeline( + dataset_train_path=args.training_dir, + train_batch_size=args.train_batch_size, + dataset_val_path=args.test_dir if args.validation_freq else None, + val_batch_size=args.val_batch_size if args.validation_freq else None, + start_path_index=start_train_path_index, + use_last_file_only_for_valid=args.fast_validation > 0, + sequence_length=args.max_context_width, + zipped_data=args.zipped_data, + seed=args.seed, + num_workers=args.data_num_workers, + resume_from_sequence_number=resume_from_sequence_number, + dp_rank=dp_rank, + dp_size=dp_size, + ) + elif args.dataset_type == "hf": + data_pipeline = HFDataPipeline( + dataset_train_path=args.training_dir, + train_batch_size=args.train_batch_size, + dataset_val_path=args.test_dir if args.validation_freq else None, + val_batch_size=args.val_batch_size if args.validation_freq else None, + seed=args.seed, + num_workers=args.data_num_workers, + resume_from_sequence_number=resume_from_sequence_number, + dp_rank=dp_rank, + dp_size=dp_size, + ) + return data_pipeline diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/data_pipeline.py b/3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/data_pipeline.py new file mode 100644 index 00000000..a4b028d5 --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/data_pipeline.py @@ -0,0 +1,104 @@ +from abc import abstractmethod + +import torch +import torch.distributed as dist +from torch.utils.data import DataLoader + + +# Adapted from accelerate's SkipDataLoader to skip certain number of sequences instead of batches +# https://github.com/huggingface/accelerate/blob/80da9cfb09bb3cc9f1b385cb55d6b90d025a5fd9/src/accelerate/data_loader.py#L858C1-L878C28 +class SkipDataLoader(DataLoader): + """ + Subclass of a PyTorch `DataLoader` that will skip the first batches. + + Args: + dataset (`torch.utils.data.dataset.Dataset`): + The dataset to use to build this datalaoder. + skip_batches (`int`, *optional*, defaults to 0): + The number of batches to skip at the beginning. + kwargs: + All other keyword arguments to pass to the regular `DataLoader` initialization. 
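+
+    Example (illustrative):
+        # Resuming after 8 sequences have already been consumed; with batch_size=4 the first
+        # two batches are skipped and iteration continues from the third batch.
+        loader = SkipDataLoader(dataset, batch_size=4, resume_from_sequence_number=8)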
+ """ + + def __init__(self, *args, resume_from_sequence_number=0, **kwargs): + super().__init__(*args, **kwargs) + self.resume_from_sequence_number = resume_from_sequence_number + + def __iter__(self): + cur_seq_index = 0 + for batch in super().__iter__(): + num_seq = int(self.batch_size) + if cur_seq_index + num_seq > self.resume_from_sequence_number: + yield batch + else: + if dist.get_rank() == 0: + print( + f"Dataloader skipping {num_seq} sequences in this batch as starting from {self.resume_from_sequence_number} sequences" + ) + cur_seq_index += num_seq + + +class DataPipeline: + def __init__( + self, + train_batch_size, + val_batch_size=None, + seed=1234, + num_workers=0, + resume_from_sequence_number=0, + dp_rank=0, + dp_size=1, + shuffle=False, + collate_fn=None, + ): + self.seed = seed + self.num_workers = num_workers + self.resume_from_sequence_number = resume_from_sequence_number + self.dp_rank = dp_rank + self.dp_size = dp_size + self.shuffle = shuffle + self.collate_fn = collate_fn + + self.train_batch_size = train_batch_size + self.val_batch_size = val_batch_size + + self.train_dataset = None + self.val_dataset = None + self.train_dataloader = None + self.val_dataloader = None + + def _create_dataloader(self, dataset, batch_size, resume_from_sequence_number): + # TODO: set sampler.epoch to correctly shuffle across epochs, else same order will be used for + # all epochs not relevant now as we have no epochs + sampler = torch.utils.data.DistributedSampler( + dataset, + shuffle=self.shuffle, + seed=self.seed, + rank=self.dp_rank, + num_replicas=self.dp_size, + drop_last=True, + ) + + kwargs = { + "sampler": sampler, + "batch_size": batch_size, + "num_workers": self.num_workers, + "collate_fn": self.collate_fn, + "pin_memory": True, + "drop_last": True, + } + + dataloader = SkipDataLoader( + dataset, resume_from_sequence_number=resume_from_sequence_number, **kwargs + + ) + + return dataloader + + @abstractmethod + def get_batch(self, data): + pass + + @abstractmethod + def get_val_batch(self, data): + pass diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/dummy_data_pipeline.py b/3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/dummy_data_pipeline.py new file mode 100644 index 00000000..9c263401 --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/dummy_data_pipeline.py @@ -0,0 +1,34 @@ +from data.dataset.dummy_dataset import DummyDataset +from data.pipelines import DataPipeline + + +class DummyDataPipeline(DataPipeline): + def __init__( + self, + vocabulary_size, + train_batch_size, + sequence_length, + val_batch_size=None, + data_type="gpt", + ): + super().__init__( + train_batch_size=train_batch_size, + ) + self.vocab_size = vocabulary_size + self.seq_length = sequence_length + self.train_dataset = DummyDataset( + data_type=data_type, vocabulary_size=vocabulary_size, seqlen=sequence_length + ) + self.train_dataloader = self._create_dataloader(self.train_dataset, self.train_batch_size, 0) + + if val_batch_size: + self.val_dataset = DummyDataset( + data_type=data_type, vocabulary_size=vocabulary_size, seqlen=sequence_length + ) + self.val_dataloader = self._create_dataloader(self.val_dataset, self.val_batch_size, 0) + + def get_batch(self, data): + return data[0], data[1], data[0] + + def get_val_batch(self, data): + return self.get_batch(data) diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/gpt_data_pipeline.py b/3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/gpt_data_pipeline.py new file 
mode 100644 index 00000000..b4e440a4 --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/gpt_data_pipeline.py @@ -0,0 +1,160 @@ +"""Data pipeline.""" +import os +from typing import List, Union + +from data.dataset.gpt_dataset import GPTPretrainingDataset +from data.pipelines.data_pipeline import DataPipeline +from data.utils import is_s3_source +from logging_utils import get_logger + +try: + from awsio.python.lib.io.s3.s3dataset import S3Dataset +except ModuleNotFoundError: + S3Dataset = None + +logger = get_logger() + + +class GPTDataPipeline(DataPipeline): + def __init__( + self, + dataset_train_path, + train_batch_size, + dataset_val_path=None, + val_batch_size=None, + start_path_index=0, + use_last_file_only_for_valid=False, + sequence_length=2048, + dataset_type="gpt", + zipped_data=False, + seed=1234, + num_workers=0, + resume_from_sequence_number=0, + dp_rank=0, + dp_size=1, + shuffle=False, + ): + super().__init__( + train_batch_size, + val_batch_size=val_batch_size, + seed=seed, + num_workers=num_workers, + resume_from_sequence_number=resume_from_sequence_number, + dp_rank=dp_rank, + dp_size=dp_size, + shuffle=shuffle, + ) + self.sequence_length = sequence_length + self.train_paths = self.get_train_paths( + dataset_type, dataset_train_path, zipped_data=zipped_data + ) + self.cur_train_path = start_path_index + self.zipped_data = zipped_data + self.start_path_index = start_path_index + # needs to be called explicitly + # self._create_train_dataset() + if val_batch_size and dataset_val_path: + self.val_paths = self.get_val_paths( + dataset_type, dataset_val_path, zipped_data=zipped_data + ) + self.use_last_file_only_for_valid = use_last_file_only_for_valid + self._create_val_dataset() + + def _create_val_dataset(self): + self.val_dataset = GPTPretrainingDataset( + self.val_paths if not self.use_last_file_only_for_valid else [self.val_paths[-1]], + max_sequence_length=self.sequence_length, + zipped=self.zipped_data, + ) + self.val_dataloader = self._create_dataloader(self.val_dataset, self.val_batch_size, 0) + + def increment_path_in_epoch(self): + self.cur_train_path += 1 + if self.cur_train_path >= len(self.train_paths): + self.cur_train_path = 0 + return False + # returns if cycled through to next epoch + return True + + def create_train_dataset(self): + self.train_dataset = GPTPretrainingDataset( + self.train_paths[self.cur_train_path : self.cur_train_path + 1], + max_sequence_length=self.sequence_length, + zipped=self.zipped_data, + ) + self.train_dataloader = self._create_dataloader(self.train_dataset, self.train_batch_size, self.resume_from_sequence_number) + + def get_train_paths( + self, data_type, training_dir, zipped_data=False + ) -> Union[List[str], "S3Dataset"]: + if data_type == "bert": + if is_s3_source(training_dir): + raise ValueError("Unsupported BERT data from s3") + train_paths = sorted( + [ + os.path.join(training_dir, p) + for p in os.listdir(training_dir) + if os.path.isfile(os.path.join(training_dir, p)) and "training" in p + ] + ) + elif data_type == "gpt": + if zipped_data > 0: + file_extension = ".json.gz" + else: + file_extension = ".json" + if is_s3_source(training_dir): + assert S3Dataset, "awsio package needs to be installed" + train_paths = S3Dataset(training_dir) + else: + train_paths = sorted( + [ + os.path.join(training_dir, p) + for p in os.listdir(training_dir) + if p.endswith(file_extension) + ] + ) + else: + raise NotImplementedError + + return train_paths + + def get_val_paths( + self, data_type, test_dir, 
zipped_data=False + ) -> Union[List[str], "S3Dataset"]: + if data_type == "bert": + if is_s3_source(test_dir): + raise ValueError("Unsupported BERT data from s3") + val_paths = sorted( + [ + os.path.join(test_dir, p) + for p in os.listdir(test_dir) + if os.path.isfile(os.path.join(test_dir, p)) and "testing" in p + ] + ) + elif data_type == "gpt": + if zipped_data > 0: + file_extension = ".json.gz" + else: + file_extension = ".json" + if is_s3_source(test_dir): + assert S3Dataset, "awsio package needs to be installed" + val_paths = S3Dataset(test_dir) + else: + val_paths = sorted( + [ + os.path.join(test_dir, p) + for p in os.listdir(test_dir) + if p.endswith(file_extension) + ] + ) + else: + raise NotImplementedError + return val_paths + + def get_batch(self, data): + input_ids, mask = data + return input_ids, mask, input_ids + + def get_val_batch(self, data): + input_ids, mask = data + return input_ids, mask diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/hf_data_pipeline.py b/3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/hf_data_pipeline.py new file mode 100644 index 00000000..1707b8ee --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/hf_data_pipeline.py @@ -0,0 +1,51 @@ +"""Data pipeline.""" +import logging + +from data.pipelines import DataPipeline +from datasets import load_from_disk +from transformers import default_data_collator + +try: + from awsio.python.lib.io.s3.s3dataset import S3Dataset +except ModuleNotFoundError: + S3Dataset = None + +logger = logging.getLogger(__file__) + + +class HFDataPipeline(DataPipeline): + def __init__( + self, + dataset_train_path, + train_batch_size, + dataset_val_path=None, + val_batch_size=None, + seed=1234, + num_workers=0, + resume_from_sequence_number=0, + dp_rank=0, + dp_size=1, + shuffle=False, + ): + super().__init__( + train_batch_size=train_batch_size, + val_batch_size=val_batch_size, + seed=seed, + num_workers=num_workers, + resume_from_sequence_number=resume_from_sequence_number, + dp_rank=dp_rank, + dp_size=dp_size, + shuffle=shuffle, + collate_fn=default_data_collator, + ) + self.train_dataset = load_from_disk(dataset_train_path) + self.train_dataloader = self._create_dataloader(self.train_dataset, self.train_batch_size, self.resume_from_sequence_number) + if val_batch_size and dataset_val_path: + self.val_dataset = load_from_disk(dataset_val_path) + self.val_dataloader = self._create_dataloader(self.val_dataset, self.val_batch_size, 0) + + def get_batch(self, data): + return data["input_ids"], data["attention_mask"], data["labels"] + + def get_val_batch(self, data): + return data["input_ids"], data["attention_mask"] diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/nemo_megatron_gpt_data_pipeline.py b/3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/nemo_megatron_gpt_data_pipeline.py new file mode 100644 index 00000000..6e8dd99e --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/data/pipelines/nemo_megatron_gpt_data_pipeline.py @@ -0,0 +1,144 @@ +"""Data pipeline.""" +import os +from pathlib import Path + +from data.pipelines.data_pipeline import DataPipeline +from logging_utils import get_logger + +logger = get_logger() + + +def make_file_list(dir_path, pattern): + files = list(Path(dir_path).glob(pattern)) + files = list(set([os.path.join(dir_path, i.stem) for i in files])) + files.sort() + files = files[:254] + proporations = [1 / len(files) for _ in range(len(files))] + return [val for pair in zip(proporations, files) for val 
in pair] + + +# This is still untesed end to end in a convergence run + +# Below arguments need to copied to arguments.py to run +# # megatron dataset +# input_grp.add_argument("--data_impl", type=str, default="mmap") +# input_grp.add_argument("--data_split", type=str, default="970, 30, 0") +# input_grp.add_argument("--mmap_warmup", type=int, default=0) +# input_grp.add_argument("--skip_warmup", action="store_true") +# input_grp.add_argument("--tokenizer_type", type=str, default="HFLlamaTokenizer") +# input_grp.add_argument("--tokenizer_vocab_file", type=str, default=None) +# input_grp.add_argument("--tokenizer_merge_file", type=str, default=None) +# input_grp.add_argument("--make_vocab_size_divisible_by", type=int, default=128) +# input_grp.add_argument("--data_dir", type=str) +# input_grp.add_argument("--data_file_regex", type=str) + +# Also need to add dataset_type "megatron" as a choice for the arg. + +# Below snippet needs to go into data/pipelines/__init__.py +# elif args.dataset_type == "megatron": +# from data.pipelines.nemo_megatron_gpt_data_pipeline import MegatronGPTDataPipeline + +# data_pipeline = MegatronGPTDataPipeline( +# args, +# seed=args.seed, +# num_workers=args.data_num_workers, +# resume_from_sequence_number=total_steps, +# dp_rank=dp_rank, +# dp_size=dp_size, +# ) + + +class MegatronGPTDataPipeline(DataPipeline): + def __init__( + self, + args, + seed=1234, + num_workers=0, + resume_from_sequence_number=0, + dp_rank=0, + dp_size=1, + shuffle=False, + ): + super().__init__( + train_batch_size=args.train_batch_size, + val_batch_size=args.val_batch_size, + seed=seed, + resume_from_sequence_number=resume_from_sequence_number, + num_workers=num_workers, + dp_rank=dp_rank, + dp_size=dp_size, + shuffle=shuffle, + ) + eval_iters = (args.max_steps // args.validation_freq + 1) * args.validation_batches + + train_valid_test_num_samples = [ + args.max_steps * args.train_batch_size, + eval_iters * args.val_batch_size, + 0, + ] + logger.info(f"{train_valid_test_num_samples}, {args.max_steps}, {eval_iters}") + from omegaconf import OmegaConf + + file_list = make_file_list(args.data_dir, args.data_file_regex) + assert len(file_list) > 0, "Please check your regex" + model_cfg_dict = { + "data": { + # "data_prefix": { + # "train": make_file_list(args.data_dir, args.data_file_regex), + # "test": make_file_list(args.data_dir, args.data_file_regex), + # "validation": make_file_list(args.data_dir, args.data_file_regex), + # splits_string ignored if data_prefix is a dict + # }, + "data_prefix": file_list, + "data_impl": args.data_impl, + "splits_string": args.data_split, + "seq_length": args.max_context_width, + "delay_data_mmap": False, + "validation_drop_last": True, + "skip_warmup": args.skip_warmup, + }, + "seed": args.seed, + } + model_cfg = OmegaConf.create(model_cfg_dict) + + from nemo.collections.common.tokenizers import AutoTokenizer + + tokenizer = AutoTokenizer("hf-internal-testing/llama-tokenizer") + + from megatron.core.parallel_state import initialize_model_parallel + + initialize_model_parallel() + from nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset import ( + build_train_valid_test_datasets, + ) + + self.train_dataset, self.val_dataset, self.test_dataset = build_train_valid_test_datasets( + model_cfg, + None, + model_cfg.data.data_prefix, + model_cfg.data.data_impl, + splits_string=model_cfg.data.splits_string, + train_valid_test_num_samples=train_valid_test_num_samples, + seq_length=model_cfg.data.seq_length, + seed=model_cfg.seed, + 
skip_warmup=model_cfg.data.get("skip_warmup", True), + tokenizer=tokenizer, + ) + self.train_dataloader = self._create_dataloader(self.train_dataset, self.train_batch_size, self.resume_from_sequence_number) + self.val_dataloader = self._create_dataloader(self.val_dataset, self.val_batch_size, 0) + self.test_dataloader = self._create_dataloader(self.test_dataset, self.val_batch_size, 0) + + logger.info( + f"Lengths of dataloaders {len(self.train_dataloader)}, {len(self.val_dataloader)}" + ) + + def get_batch(self, data): + tokens = data["tokens"].long() + labels = data["labels"].long() + mask = data["attention_mask"] + return tokens, mask, labels + + def get_val_batch(self, data): + tokens = data["tokens"].long() + mask = data["attention_mask"] + return tokens, mask diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/data/prep/_prepare_nemo_megatron_dataset.py b/3.test_cases/17.SM-modelparallelv2/scripts/data/prep/_prepare_nemo_megatron_dataset.py new file mode 100644 index 00000000..2574a40c --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/data/prep/_prepare_nemo_megatron_dataset.py @@ -0,0 +1,392 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Processing data for megatron pretraining. + +It can be used to convert the text data into indexed dataset for BERT, GPT, T5, RETRO models etc. 
+ + +Example script to preprocess the loose JSON file for BERT model + +```python +python scripts/nlp_language_modeling/preprocess_data_for_megatron.py \ + --input=PATH_TO_THE_RETRIEVAL_DB_LOOSE_JSON_FILE \ + --json-keys=text \ + --vocab-file=PATH_TO_VOCAB_FILE \ + --dataset-impl=mmap \ + --output-prefix=YOUR_DATA_PREFIX \ + --tokenizer-library=megatron \ + --tokenizer-type=BertWordPieceCase \ + --split-sentences \ + --workers=48 +``` + +Example script to preprocess the loose JSON file for GPT model + +```python +python scripts/nlp_language_modeling/preprocess_data_for_megatron.py \ + --input=PATH_TO_THE_RETRIEVAL_DB_LOOSE_JSON_FILE \ + --json-keys=text \ + --tokenizer-library=megatron \ + --tokenizer-type=GPT2BPETokenizer \ + --dataset-impl=mmap \ + --merge-file=YOUR_MERGE_FILE \ + --vocab-file=YOUR_VOCAB_FILE \ + --output-prefix=YOUR_DATA_PREFIX \ + --append-eod \ + --workers=48 +``` + +Example script to preprocess the loose JSON file for retrieval DB Dataset + +```python +python scripts/nlp_language_modeling/preprocess_data_for_megatron.py \ + --input=PATH_TO_THE_RETRIEVAL_DB_LOOSE_JSON_FILE \ + --json-keys=text \ + --tokenizer-library=sentencepiece \ + --dataset-impl=retmmap \ + --tokenizer-model=tokenizer.model \ + --output-prefix=retro_db \ + --need-pad-id \ + --append-eod \ + --retrieval-db \ + --chunk_size=64 \ + --workers=64 +``` + +Example script to preprocess the JSON file for retrieval training dataset + +```python +python scripts/nlp_language_modeling/preprocess_data_for_megatron.py \ + --input=PATH_TO_THE_RETRIEVAL_TRAIN_VAL_TEST_LOOSE_JSON_FILE \ + --json-keys=text \ + --tokenizer-library=sentencepiece \ + --dataset-impl=retmmap \ + --tokenizer-model=tokenizer.model \ + --output-prefix=retro_data \ + --need-pad-id \ + --append-eod \ + --chunk_size=64 \ + --workers=64 +``` +""" + +import argparse +import gzip +import json +import multiprocessing +import os +import pathlib +import sys +import time + +import ftfy +import torch +from nemo.collections.nlp.data.language_modeling.megatron import indexed_dataset +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + +try: + import nltk + + nltk_available = True +except ImportError: + nltk_available = False + +# https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer +class CustomLanguageVars(nltk.tokenize.punkt.PunktLanguageVars): + + _period_context_fmt = r""" + \S* # some word material + %(SentEndChars)s # a potential sentence ending + \s* # <-- THIS is what I changed + (?=(?P + %(NonWord)s # either other punctuation + | + (?P\S+) # <-- Normally you would have \s+ here + ))""" + + +class IdentitySplitter(object): + def tokenize(self, *text): + return text + + +def get_tokenizer(args): + tokenizer = get_nmt_tokenizer( + library=args.tokenizer_library, + model_name=args.tokenizer_type, + tokenizer_model=args.tokenizer_model, + vocab_file=args.vocab_file, + merges_file=args.merge_file, + delimiter=args.delimiter, + ) + if args.need_pad_id: + if not hasattr(tokenizer, "pad_id"): + tokenizer.add_special_tokens({"pad_token": ""}) + elif hasattr(tokenizer, "pad_id") and (tokenizer.pad_id is None or tokenizer.pad_id < 0): + tokenizer.add_special_tokens({"pad_token": ""}) + return tokenizer + + +class Encoder(object): + def __init__(self, args): + self.args = args + + def initializer(self): + # Use Encoder class as a container for global data + Encoder.tokenizer = get_tokenizer(self.args) + + if self.args.split_sentences: + if not nltk_available: + print("NLTK is not 
available to split sentences.") + exit() + splitter = nltk.load("tokenizers/punkt/english.pickle") + if self.args.keep_newlines: + # this prevents punkt from eating newlines after sentences + Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer( + train_text=splitter._params, lang_vars=CustomLanguageVars() + ) + else: + Encoder.splitter = splitter + + else: + Encoder.splitter = IdentitySplitter() + + def encode(self, json_line): + if not self.args.text_file: + data = json.loads(json_line) + ids = {} + for key in self.args.json_keys: + text = data[key] + if self.args.apply_ftfy: + text = ftfy.fix_text(text) + doc_ids = [] + for sentence in Encoder.splitter.tokenize(text): + sentence_ids = Encoder.tokenizer.text_to_ids(sentence) + if len(sentence_ids) > 0: + doc_ids.append(sentence_ids) + if len(doc_ids) > 0 and self.args.append_eod: + doc_ids[-1].append(Encoder.tokenizer.eos_id) + ids[key] = doc_ids + else: + data = json_line + ids = {} + text = data.strip() + if self.args.apply_ftfy: + text = ftfy.fix_text(text) + doc_ids = [] + for sentence in Encoder.splitter.tokenize(text): + sentence_ids = Encoder.tokenizer.text_to_ids(sentence) + if len(sentence_ids) > 0: + doc_ids.append(sentence_ids) + if len(doc_ids) > 0 and self.args.append_eod: + doc_ids[-1].append(Encoder.tokenizer.eos_id) + ids["text"] = doc_ids + return ids, len(json_line) + + +def get_args(): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title="input data") + group.add_argument( + "--input", + type=str, + required=True, + help="Path to the input json or json.gz file. If preprocessing an entire folder, set the --preproc-folder flag and provide the path to the folder in this arg.", + ) + group.add_argument( + "--json-keys", + nargs="+", + default=["text"], + help="space separate listed of keys to extract from json", + ) + group.add_argument( + "--split-sentences", action="store_true", help="Split documents into sentences." + ) + group.add_argument( + "--keep-newlines", + action="store_true", + help="Keep newlines between sentences when splitting.", + ) + group.add_argument("--text_file", action="store_true", help="Use text file instead of json.") + group = parser.add_argument_group(title="tokenizer") + group.add_argument( + "--tokenizer-library", + type=str, + required=True, + choices=["yttm", "sentencepiece", "megatron", "huggingface", "tabular"], + help="What tokenizer library to use.", + ) + group.add_argument( + "--tokenizer-type", + type=str, + default=None, + help="What type of tokenizer to use.", + ) + group.add_argument( + "--tokenizer-model", + type=str, + default=None, + help="Path to tokenizer model.", + ) + group.add_argument("--vocab-file", type=str, default=None, help="Path to the vocab file") + group.add_argument("--files-filter", type=str, default="**/*.json*", help="files filter str") + group.add_argument( + "--merge-file", type=str, default=None, help="Path to the BPE merge file (if necessary)." + ) + group.add_argument( + "--delimiter", type=str, default=None, help="delimiter used for tabular tokenizer" + ) + group.add_argument( + "--append-eod", action="store_true", help="Append an token to the end of a document." 
+ ) + group.add_argument("--retrieval-db", action="store_true", help="Dataset used for retrieval.") + group.add_argument( + "--need-pad-id", action="store_true", help="Whether we need the pad id for the tokenizer" + ) + group = parser.add_argument_group(title="output data") + group.add_argument( + "--output-prefix", type=str, required=True, help="Path to binary output file without suffix" + ) + group.add_argument( + "--dataset-impl", type=str, default="mmap", choices=["lazy", "cached", "mmap", "retmmap"] + ) + + group = parser.add_argument_group(title="runtime") + group.add_argument( + "--workers", type=int, default=1, help="Number of worker processes to launch" + ) + group.add_argument("--chunk_size", type=int, default=64, help="chunk size used for retrieval") + group.add_argument( + "--chunk_stride_size", + type=int, + default=64, + help="the stride size for neighbor chunks used for retrieval", + ) + + group.add_argument( + "--log-interval", type=int, default=100, help="Interval between progress updates" + ) + group.add_argument( + "--preproc-folder", + action="store_true", + help="If set, will preprocess all .json or .json.gz files into a single .bin and .idx file. Folder path provided via the --input arg", + ) + group.add_argument( + "--apply-ftfy", action="store_true", help="If set, will apply ftfy to the input text" + ) + args = parser.parse_args() + args.keep_empty = False + + if args.tokenizer_type is not None and args.tokenizer_type.lower().startswith("bert"): + if not args.split_sentences: + print("Bert tokenizer detected, are you sure you don't want to split sentences?") + + # some default/dummy values for the tokenizer + args.rank = 0 + args.make_vocab_size_divisible_by = 128 + args.tensor_model_parallel_size = 1 + args.vocab_extra_ids = 0 + # TODO: There are dependencies b/w libraries and model files / tokenizer type strings to check. 
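+    # For reference, the wrapper script data/prep/prepare_nemo_megatron_dataset.py calls this file with
+    # `--tokenizer-library=huggingface --tokenizer-type hf-internal-testing/llama-tokenizer`, which is
+    # enough to satisfy the assertion below without a --tokenizer-model.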
+ assert args.tokenizer_type is not None or args.tokenizer_model is not None + return args + + +def main(): + args = get_args() + startup_start = time.time() + if args.preproc_folder: + print("Searching folder for .json or .json.gz files...") + assert os.path.exists(args.input), f"Folder does not exist: {args.input}" + json_files = (str(f) for f in pathlib.Path(args.input).glob(args.files_filter)) + json_files = [f for f in json_files if f.endswith(".json") or f.endswith(".json.gz")] + if len(json_files) == 0: + raise FileNotFoundError("No .json or .json.gz files found in folder.") + else: + print(f"Found {len(json_files)} .json or .json.gz files.") + else: + assert os.path.exists(args.input), f"File does not exist: {args.input}" + json_files = [args.input] + + if nltk_available and args.split_sentences: + nltk.download("punkt", quiet=True) + + encoder = Encoder(args) + + if args.dataset_impl == "retmmap": + assert args.need_pad_id, "retmmap need --need_pad_id flag" + tokenizer = get_tokenizer(args) + + level = "document" + if args.split_sentences: + level = "sentence" + + print(f"Vocab size: {tokenizer.vocab_size}") + print(f"Output prefix: {args.output_prefix}") + output_bin_files = {} + output_idx_files = {} + builders = {} + for key in args.json_keys: + output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix, key, level) + output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix, key, level) + builders[key] = indexed_dataset.make_builder( + output_bin_files[key], + impl=args.dataset_impl, + chunk_size=args.chunk_size, + pad_id=tokenizer.pad_id if hasattr(tokenizer, "pad_id") else 0, + retrieval_db=args.retrieval_db, + vocab_size=tokenizer.vocab_size, + stride=args.chunk_stride_size, + ) + + startup_end = time.time() + proc_start = time.time() + total_bytes_processed = 0 + print("Time to startup:", startup_end - startup_start) + + pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) + + for idx, json_file in enumerate(json_files): + print(f"Processing file {json_file} {idx + 1}/{len(json_files)}") + if json_file.endswith(".gz"): + fin = gzip.open(json_file, "r") + else: + fin = open(args.input, "r", encoding="utf-8") + + encoded_docs = pool.imap(encoder.encode, fin, 25) + + for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1): + total_bytes_processed += bytes_processed + for key, sentences in doc.items(): + if len(sentences) == 0: + continue + for sentence in sentences: + builders[key].add_item(torch.IntTensor(sentence)) + builders[key].end_document() + if i % args.log_interval == 0: + current = time.time() + elapsed = current - proc_start + mbs = total_bytes_processed / elapsed / 1024 / 1024 + print( + f"Processed {i} documents", + f"({i/elapsed} docs/s, {mbs} MB/s).", + file=sys.stderr, + ) + + for key in args.json_keys: + builders[key].finalize(output_idx_files[key]) + + +if __name__ == "__main__": + main() diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/data/prep/prep_hf_dataset.slurm b/3.test_cases/17.SM-modelparallelv2/scripts/data/prep/prep_hf_dataset.slurm new file mode 100644 index 00000000..7d3da423 --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/data/prep/prep_hf_dataset.slurm @@ -0,0 +1,25 @@ +#!/bin/bash +#SBATCH --output=logs/%x_%j.out # Redirects outputs to file in current_dir/logs +#SBATCH --error=logs/%x_%j.out # Redirects err to same file in current_dir/logs +#SBATCH --job-name=prep_hf_data +#SBATCH --ntasks-per-node=1 +#SBATCH -N 1 + +## Below examples for llama tokenizer + +## WIKICORPUS 
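+## The flags below map to the argparse options defined in prepare_hf_dataset.py; adjust
+## --output_dir and --seq_len for your environment.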
+python prepare_hf_dataset.py --dataset_name wikicorpus \ + --dataset_config_name raw_en \ + --val_split_percentage 20 \ + --hf_tokenizer_name meta-llama/Llama-2-7b-hf \ + --seq_len 4096 \ + --output_dir /fsx/datasets/temp/wikicorpus__raw_en/llama/4096/ + +## C4 +# Had to delete a file which was incomplete and crashed the job +# rm /fsx/datasets/.cache/datasets/downloads/extracted/741a4aaf04e7748f791ce4525c5876f13a45e8115d76b099c818cf7970972c48 +python prepare_hf_dataset.py --dataset_path /fsx/datasets/c4/en/hf \ + --output_dir /fsx/datasets/temp/c4/en/hf-tokenized/llama \ + --hf_tokenizer_name meta-llama/Llama-2-7b-hf \ + --seq_len 4096 \ + --val_split_percentage 20 diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/data/prep/prep_nmt_dataset.slurm b/3.test_cases/17.SM-modelparallelv2/scripts/data/prep/prep_nmt_dataset.slurm new file mode 100644 index 00000000..e8c1f9ef --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/data/prep/prep_nmt_dataset.slurm @@ -0,0 +1,13 @@ +#!/bin/bash +#SBATCH --output=logs/%x_%j.out # Redirects outputs to file in current_dir/logs +#SBATCH --error=logs/%x_%j.out # Redirects err to same file in current_dir/logs +#SBATCH --job-name=prep_nmt_data +#SBATCH --ntasks-per-node=1 +#SBATCH -N 1 + +CONDA_ENV_PATH=${1:-"$CONDA_DEFAULT_ENV"} +if [[ -z "${CONDA_ENV_PATH}" ]]; then + echo "Conda env not set, exiting" +fi + +srun -l -D `pwd` conda run -p $CONDA_ENV_PATH --no-capture-output python data/prepare_nemo_megatron_dataset.py diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/data/prep/prepare_hf_dataset.py b/3.test_cases/17.SM-modelparallelv2/scripts/data/prep/prepare_hf_dataset.py new file mode 100644 index 00000000..6b6ce43a --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/data/prep/prepare_hf_dataset.py @@ -0,0 +1,186 @@ +import argparse +import functools +import logging +import os +from itertools import chain + +import torch +import transformers +from datasets import load_dataset +from transformers import AutoTokenizer +from transformers.testing_utils import CaptureLogger + +# Either set token here or in the env +# login(token="", add_to_git_credential=True, new_session=False) + + +logger = logging.getLogger(__name__) + +""" +Example commands +---- +1. Wikicorpus for llama +python prepare_hf_dataset.py --dataset_name wikicorpus \ + --dataset_config_name raw_en \ + --val_split_percentage 20 \ + --hf_tokenizer_name meta-llama/Llama-2-7b-hf \ + --seq_len 4096 \ + --output_dir /fsx/datasets/wikicorpus__raw_en/llama/4096/ + +2. 
C4 +# Had to delete a file which was incomplete +# rm ~/.cache/huggingface/datasets/downloads/extracted/741a4aaf04e7748f791ce4525c5876f13a45e8115d76b099c818cf7970972c48 +python prepare_hf_dataset.py --dataset_path /fsx/datasets/c4/en/hf \ + --output_dir /fsx/datasets/c4/en/hf-tokenized/llama \ + --hf_tokenizer_name meta-llama/Llama-2-7b-hf \ + --seq_len 4096 \ + --val_split_percentage 20 +""" + +parser = argparse.ArgumentParser() +parser.add_argument("--dataset_name", type=str, default=None) +parser.add_argument("--dataset_config_name", type=str, default=None) +parser.add_argument("--dataset_path", type=str, default=None) +parser.add_argument("--val_split_percentage", type=int, default=20) +parser.add_argument("--hf_tokenizer_name", type=str, default="meta-llama/Llama-2-7b-hf") +parser.add_argument("--output_dir", default=None, type=str) +parser.add_argument("--num_proc", default=64, type=int) +parser.add_argument("--seq_len", type=int, default=4096) +args, _ = parser.parse_known_args() + +if args.dataset_path is not None and (args.dataset_name is not None and args.dataset_config_name): + raise ValueError("Set either (dataset_path) or (dataset_name, dataset_config_name)") +elif args.dataset_path is None: + if args.dataset_name is None or args.dataset_config_name is None: + raise ValueError( + "If dataset_path is not set, then both dataset_name and dataset_config_name need to be set" + ) +do_train = True +do_eval = True + + +def tokenize_function(tokenizer, text_column_name, examples): + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + with CaptureLogger(tok_logger) as cl: + output = _tokenize_function(tokenizer, text_column_name, examples) + # clm input could be much much longer than block_size + if "Token indices sequence length is longer than the" in cl.out: + tok_logger.warning( + "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." + ) + return output + + +def _tokenize_function(tokenizer, text_column_name, examples): + return tokenizer(examples[text_column_name]) + + +# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. +def group_texts(block_size, examples): + # Concatenate all texts. + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. 
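+    # Worked example: with block_size=4 and 10 concatenated tokens, total_length is truncated
+    # to 8, yielding two chunks of 4 tokens; the trailing 2 tokens are dropped.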
+ result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + else: + result = {} + return result + + +def tokenize_dataset( + dataset_name, + dataset_config_name, + dataset_path, + hf_tokenizer_name, + output_dir, + val_split_percentage=20, + sequence_length=4096, + num_proc=64, + overwrite_cache=False, +): + cache_dir = "/fsx/datasets/.cache/datasets/" + if dataset_path is not None: + raw_datasets = load_dataset(dataset_path, num_proc=num_proc, cache_dir=cache_dir) + else: + raw_datasets = load_dataset( + dataset_name, dataset_config_name, num_proc=num_proc, cache_dir=cache_dir + ) + + os.makedirs(output_dir, exist_ok=True) + train_split_percentage = 100 - val_split_percentage + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + dataset_name, + dataset_config_name, + split=f"train[:{val_split_percentage}%]", + cache_dir=cache_dir, + ) + + raw_datasets["train"] = load_dataset( + dataset_name, + dataset_config_name, + split=f"train[:{train_split_percentage}%]", + cache_dir=cache_dir, + ) + + tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_name, trust_remote_code=True) + + column_names = raw_datasets["train"].column_names + text_column_name = "text" if "text" in column_names else column_names[0] + + # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function + # tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + tokenized_datasets = raw_datasets.map( + functools.partial(tokenize_function, tokenizer, text_column_name), + batched=True, + num_proc=num_proc, + remove_columns=column_names, + load_from_cache_file=not overwrite_cache, + desc="Running tokenizer on dataset", + ) + + assert tokenizer.model_max_length >= sequence_length + + lm_datasets = tokenized_datasets.map( + functools.partial(group_texts, sequence_length), + batched=True, + num_proc=num_proc, + load_from_cache_file=not overwrite_cache, + desc=f"Grouping texts in chunks of {sequence_length}", + ) + if do_train: + if "train" not in tokenized_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = lm_datasets["train"] + train_dataset.save_to_disk(f"{output_dir}/train/", num_proc=num_proc) + + if do_eval: + if "validation" not in tokenized_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = lm_datasets["validation"] + eval_dataset.save_to_disk(f"{output_dir}/val/", num_proc=num_proc) + + torch.save({"arguments": args}, f"{output_dir}/args") + + +if __name__ == "__main__": + tokenize_dataset( + dataset_name=args.dataset_name, + dataset_config_name=args.dataset_config_name, + dataset_path=args.dataset_path, + hf_tokenizer_name=args.hf_tokenizer_name, + output_dir=args.output_dir, + val_split_percentage=args.val_split_percentage, + sequence_length=args.seq_len, + num_proc=args.num_proc, + ) diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/data/prep/prepare_nemo_megatron_dataset.py b/3.test_cases/17.SM-modelparallelv2/scripts/data/prep/prepare_nemo_megatron_dataset.py new file mode 100644 index 00000000..243b49ca --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/data/prep/prepare_nemo_megatron_dataset.py @@ -0,0 +1,39 @@ +import os +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +SRC_DIR = "/fsx/datasets/c4/en/hf/" +OUT_DIR = 
"/fsx/datasets/c4/en/nmt-tokenized-2/llama" + +if not Path(OUT_DIR).exists(): + os.makedirs(OUT_DIR) + + +def process_file(idx): + file_idx_str = str(idx).zfill(5) + file_stem = f"c4-train.{file_idx_str}-of-01024" + file_name = f"{file_stem}.json.gz" + cmd = f"python data/_prepare_nemo_megatron_dataset.py \ + --input {os.path.join(SRC_DIR, file_name)} \ + --output-prefix {OUT_DIR}/{file_stem} \ + --tokenizer-library=huggingface \ + --tokenizer-type hf-internal-testing/llama-tokenizer \ + --dataset-impl mmap \ + --append-eod \ + --workers 32" + os.system(cmd) + output_partition_files = list(Path(OUT_DIR).glob(f"{file_stem}_[0-9]*")) + # Running with 2 partitions creates some extra files we don't need + for a_file in output_partition_files: + a_file.unlink() + input_partition_files = list(Path(SRC_DIR).glob(f"{file_stem}.json_[0-9].gz")) + for a_file in input_partition_files: + a_file.unlink() + + +pool = ThreadPoolExecutor(max_workers=32) + +# import os +# node_id = int(os.getenv('SLURM_NODEID')) +# num_nodes = int(os.getenv('SLURM_NNODES')) +threads = [pool.submit(process_file, idx) for idx in range(95, 256)] diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/data/utils.py b/3.test_cases/17.SM-modelparallelv2/scripts/data/utils.py new file mode 100644 index 00000000..79f41a2d --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/data/utils.py @@ -0,0 +1,10 @@ +_S3_PREFIX = "s3://" + + +def is_s3_source(src): + return src.startswith(_S3_PREFIX) + + +def parse_s3_address(address): + address = address[len(_S3_PREFIX) :] + return address.split("/", 1) diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/fsdp_utils.py b/3.test_cases/17.SM-modelparallelv2/scripts/fsdp_utils.py new file mode 100644 index 00000000..cb14a7bb --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/fsdp_utils.py @@ -0,0 +1,81 @@ +"""FSDP utils.""" + +# pylint: disable=fixme,import-error,import-outside-toplevel,no-name-in-module +from distutils.version import LooseVersion + +import torch +from torch.distributed.fsdp import BackwardPrefetch, ShardingStrategy +from torch.sagemaker.logger import get_logger + +_logger = get_logger() + + +def get_sharding_strategy(strategy: str): + """Get sharding strategy.""" + sharding_strategy = getattr(ShardingStrategy, strategy.upper()) + _logger.debug("Translating %s to %s.", strategy, sharding_strategy) + return sharding_strategy + + +def get_backward_fetch_policy(policy: str): + """Get backward fetch policy.""" + backward_fetch_policy = getattr(BackwardPrefetch, policy.upper()) + _logger.debug("Translating %s to %s.", policy, backward_fetch_policy) + return backward_fetch_policy + + +def get_transformer_layer(model_type="gpt2", use_smp_implementation=False): + """Get transformer layer.""" + if use_smp_implementation: + # For pt-2.1-tsm-2.1 releases and below, + # We can't checkpoint our transformer.TransformerLayer class as it takes a tuple as input, + # so we checkpoint the te.TETransformerLayer directly instead. + # In later versions, we patch TransformerEngine activation checkpointing logic in our containers + # with some missing native PyTorch checkpoint logic and bug fixes to resolve this. 
+ # PT ref: https://github.com/pytorch/pytorch/blob/v2.2.0/torch/utils/checkpoint.py#L307-L319 + # TE ref: https://github.com/NVIDIA/TransformerEngine/blob/v1.2.1/transformer_engine/pytorch/distributed.py#L272 + if LooseVersion(torch.__version__) >= LooseVersion("2.2.0"): + from torch.sagemaker.tensor_parallel.transformer import TransformerLayer + + transformer_layer = TransformerLayer + else: + from torch.sagemaker.tensor_parallel.transformer import TETransformerLayer + + transformer_layer = TETransformerLayer + elif model_type == "gpt2": + from transformers.models.gpt2.modeling_gpt2 import GPT2Block + + transformer_layer = GPT2Block + + elif model_type == "gpt_neox": + from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXLayer + + transformer_layer = GPTNeoXLayer + + elif model_type == "bloom": + from transformers.models.bloom.modeling_bloom import BloomBlock + + transformer_layer = BloomBlock + + elif model_type == "flash_gptneox": + from flash_attn.modules.block import ParallelBlock + + # TODO: Add support for Block + transformer_layer = ParallelBlock + elif model_type == "rubik_gpt_neox": + from smpv1.transformer import DistributedTransformerLayer + + transformer_layer = DistributedTransformerLayer + elif model_type == "llama_v2": + from transformers.models.llama.modeling_llama import LlamaDecoderLayer + + transformer_layer = LlamaDecoderLayer + elif model_type == "mistral": + from transformers.models.mistral.modeling_mistral import MistralDecoderLayer + + transformer_layer = MistralDecoderLayer + elif model_type == "mixtral": + from transformers.models.mixtral.modeling_mixtral import MixtralDecoderLayer + + transformer_layer = MixtralDecoderLayer + return transformer_layer diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/learning_rates.py b/3.test_cases/17.SM-modelparallelv2/scripts/learning_rates.py new file mode 100644 index 00000000..ccdf91dc --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/learning_rates.py @@ -0,0 +1,143 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Learning rate decay functions.""" + +import math + +import torch.distributed as dist +from logging_utils import get_logger + +logger = get_logger() + +# pylint: disable=invalid-name +class AnnealingLR: # pylint: disable=too-many-instance-attributes + """Anneals the learning rate.""" + + def __init__( # pylint: disable=too-many-arguments + self, + optimizer, + start_lr, + warmup_iter, + plateau_iter, + total_iters, + decay_style, + last_iter, + min_lr=0.0, + use_checkpoint_lr_scheduler=True, + override_lr_scheduler=False, + ): + + # Class values. 
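+        # Supported decay_style values (see get_lr below): "linear", "plateau", "cosine",
+        # "exponential"; any other value keeps a constant lr, floored at min_lr.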
+ self.optimizer = optimizer + self.start_lr = start_lr + self.min_lr = min_lr + self.warmup_iter = warmup_iter + self.plateau_iter = plateau_iter + self.num_iters = last_iter + self.end_iter = total_iters + assert self.end_iter > 0 + self.decay_style = decay_style + self.override_lr_scheduler = override_lr_scheduler + self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler + if self.override_lr_scheduler: + assert not self.use_checkpoint_lr_scheduler, ( + "both override and " "use-checkpoint are set." + ) + # Set the learning rate + self.step(self.num_iters) + self.rank = dist.get_rank() + + def get_lr(self): + """Learning rate decay functions from: + https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" + + num_iters_ = min(self.num_iters, self.end_iter - self.warmup_iter) + # Warmup. + if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter: + return float(self.start_lr) * num_iters_ / self.warmup_iter + + num_iters_ = num_iters_ - self.warmup_iter + if self.decay_style == "linear": + lr = self.start_lr * (self.end_iter - num_iters_) / self.end_iter + elif self.decay_style == "plateau": + if self.num_iters <= self.plateau_iter: + lr = self.start_lr + else: + lr = ( + self.start_lr + * (self.end_iter - self.num_iters) + / (self.end_iter - self.plateau_iter) + ) + elif self.decay_style == "cosine": + lr = self.start_lr / 2.0 * (math.cos(math.pi * num_iters_ / self.end_iter) + 1) + elif self.decay_style == "exponential": + # exp(-0.693) = 1/2 + lr = self.start_lr * math.exp(-0.693 * num_iters_ / self.end_iter) + else: + lr = self.start_lr + return max(lr, self.min_lr) + + def step(self, step_num=None): + """Set lr for all parameters groups.""" + if step_num is None: + step_num = self.num_iters + 1 + self.num_iters = step_num + new_lr = self.get_lr() + for group in self.optimizer.param_groups: + group["lr"] = new_lr + + def state_dict(self): + """State dict.""" + state_dict = { + "start_lr": self.start_lr, + "warmup_iter": self.warmup_iter, + "num_iters": self.num_iters, + "decay_style": self.decay_style, + "end_iter": self.end_iter, + "min_lr": self.min_lr, + } + return state_dict + + def _check_and_set(self, cls_value, sd_value, name): + """Auxiliary function for checking the values in the checkpoint and + setting them.""" + if self.override_lr_scheduler: + if self.rank == 0: + logger.info(f"Overriding {name} value to {cls_value}") + return cls_value + + if not self.use_checkpoint_lr_scheduler: + assert ( + cls_value == sd_value + ), f"AnnealingLR: class input value and checkpoint values for {name} do not match" + if self.rank == 0: + logger.info(f" > using checkpoint value {sd_value} for {name}") + return sd_value + + def load_state_dict(self, sd): + """Load state dict.""" + self.start_lr = self._check_and_set(self.start_lr, sd["start_lr"], "learning rate") + self.min_lr = self._check_and_set(self.min_lr, sd["min_lr"], "minimum learning rate") + self.warmup_iter = self._check_and_set( + self.warmup_iter, sd["warmup_iter"], "warmup iterations" + ) + self.end_iter = self._check_and_set( + self.end_iter, sd["end_iter"], "total number of iterations" + ) + self.decay_style = self._check_and_set(self.decay_style, sd["decay_style"], "decay style") + + self.num_iters = sd["num_iters"] + self.step(self.num_iters) diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/logging_utils.py b/3.test_cases/17.SM-modelparallelv2/scripts/logging_utils.py new file mode 100644 index 00000000..4832937f --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/logging_utils.py @@ -0,0 +1,175 @@ 
+"""Logging utils.""" + +import logging +import os +from typing import Any, Dict, Optional + +import numpy as np +import torch.distributed as dist + +_logger = None + + +def create_args_table(args: Dict) -> str: + """Create args table.""" + table_str = "" + table_header = "|" + "#" + "|" + "Arguments" + "|" + "Value" + "|" + "\n" + separator = "|-----" * 3 + '|' + "\n" + table_str += table_header + separator + for idx, (key, col) in enumerate(sorted(args.items())): + table_row = f"| {idx} | {key} | {col} |\n" + table_str += table_row + return table_str + + +def get_logger(): + """Get logger.""" + global _logger + if _logger is None: + logging.getLogger("torch.distributed.checkpoint._dedup_tensors").setLevel(logging.ERROR) + logging.getLogger("torch.distributed.distributed_c10d").setLevel(logging.ERROR) + _logger = logging.getLogger(__name__) + _logger.setLevel(logging.INFO) + _logger.handlers = [] + ch = logging.StreamHandler() + formatter = logging.Formatter( + "%(asctime)s %(levelname).1s " "[%(filename)s:%(lineno)d] %(message)s", + "%Y-%m-%d %H:%M:%S", + ) + ch.setFormatter(formatter) + _logger.addHandler(ch) + _logger.propagate = False + return _logger + + +def show_env_vars(rank: Optional[int] = 0): + """Show env vars.""" + my_rank = dist.get_rank() + + env_var = os.environ + if rank is None or my_rank == rank: + _logger.info("Env variables (len = %d):", len(env_var)) + + count = 0 + for key, value in sorted(env_var.items()): + _logger.info( + "[%03d] env [%03d/%03d] %-20s: `%s`", my_rank, count, len(env_var), key, value + ) + count += 1 + + keys = ( + "HOSTNAME", + "SLURM_PROCID", + ) + values = tuple(str(env_var.get(key)) for key in keys) + if my_rank % 8 == 0: # Print from each node exactly once. + _logger.info("[%03d] env from all nodes `%s`: `%s`", my_rank, keys, values) + + +def write_nccl_test_stats( + writers, report: Optional[Dict[str, Any]], prefix: str = "", step: int = -1 +) -> None: + """Write NCCL test stats.""" + + # 1. Different units and scale. + separate_fields = ("len",) + # 2. Bandwidth: Scalars and vectors. + stats_fields = ( + "min", "min2", "min3", "min4", "min5", "max", "max2", "max3", "max4", "max5", + "mean", "median", "std" + ) + vector_fields = ("data", "data_sorted") + + for writer in writers: + for field in separate_fields: + if field in report: + writer.add_scalar(f"NCCLTest/{prefix}{field}", report[field], step) + + # Bandwidth. + # - Scalars. + stats = {field: report[field] for field in stats_fields if field in report} + if stats: + _logger.info("NCCL test stats: `%s`.", stats) + writer.add_scalars(f"NCCLTest/{prefix}stats", stats, step) + + # - Vectors. + for field in vector_fields: + if field not in report: + continue + vector = report[field] + _logger.info("NCCL test vectors (`%s`, len = %02d): `%s`.", field, len(vector), vector) + + writer.add_histogram(f"NCCLTest/{prefix}{field}-hist", vector, step) + # When written as a scalar, its *max* step is written at the given step. 
+ for index, value in enumerate(np.flip(vector)): + writer.add_scalar(f"NCCLTest/{prefix}{field}", value, step - index) + + +def write_metrics_train_step( + writers, display_step, loss_scalar, throughput, tflops_per_gpu, current_lr, grad_norm +): + """Write train metrics.""" + for writer in writers: + writer.add_scalar("Loss/train", loss_scalar, display_step) + writer.add_scalar("Perf/SeqPerSec", throughput, display_step) + writer.add_scalar("Perf/ModelTFLOPs", tflops_per_gpu, display_step) + writer.add_scalar("LR/learning_rate", current_lr, display_step) + writer.add_scalar("Norms/grad_norm", grad_norm, display_step) + + +def log_train_metrics( + args, + total_steps, + display_step, + loss_scalar, + throughput, + tflops_per_gpu, + current_lr, + grad_norm, + throughputs, + num_params, + world_size, + batch_seqlen, +): + """Log train metrics.""" + _logger.info( + "Batch %d Loss: %s, Speed: %.2f samples/sec, Model TFLOPS/GPU: %.2f, lr: %.6f, gradnorm: %.4f", # pylint: disable=line-too-long + display_step, + loss_scalar, + throughput, + tflops_per_gpu, + current_lr, + grad_norm, + ) + + # Compute average throughput and tflops after 30 steps to remove + # high variance in initial steps + if len(throughputs) > 30 and not total_steps % args.logging_freq_for_avg: + avg_throughput = np.average(throughputs[30:]) + from train_utils import compute_tflops + + avg_tflops = compute_tflops(avg_throughput, num_params, world_size, batch_seqlen) + _logger.info( + "Batch %d Running Avg Speed: %.2f samples/sec, Running Avg Model TFLOPS/GPU: %.2f", # pylint: disable=line-too-long + display_step, + avg_throughput, + avg_tflops, + ) + + +def log_and_write_eval_metrics(writers, display_step, val_loss, val_ppl): + """Log and write eval metrics.""" + for writer in writers: + writer.add_scalar("Loss/val", val_loss, display_step) + writer.add_scalar("Loss/perplexity", val_ppl, display_step) + + _logger.info( + "Batch %d Validation loss: %s", + display_step, + val_loss, + ) + _logger.info( + "Batch %d Validation perplexity: %s", + display_step, + val_ppl, + ) diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/memory_tracker.py b/3.test_cases/17.SM-modelparallelv2/scripts/memory_tracker.py new file mode 100644 index 00000000..00ca1244 --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/memory_tracker.py @@ -0,0 +1,146 @@ +"""Memory tracker.""" + +import os +from typing import Any, Tuple + +import psutil +import torch +import torch.distributed as dist + +try: + from py3nvml import py3nvml +except ImportError: + py3nvml = None + +# pylint: disable=global-statement +dtype_to_bit = { + torch.float32: 32, + torch.float64: 64, + torch.float16: 16, + torch.bfloat16: 16, + torch.uint8: 8, + torch.int8: 8, + torch.int16: 16, + torch.int32: 32, + torch.int64: 64, + torch.bool: 1, +} + +process = psutil.Process(os.getpid()) +base_mem_usage = process.memory_info().data +last_mem_usage = base_mem_usage + +_GB = 1024**3 +_FORMAT = "7.4f" + + +def memory_status( # pylint: disable=too-many-locals + tag: str = "", + reset_max: bool = True, + sync: bool = True, + writers: Tuple[Any] = (), + step: int = 0, +) -> Tuple[float]: + """Memory status gpu.""" + rank = dist.get_rank() + local_rank = rank % torch.cuda.device_count() + + if rank > 0: + return 0., 0., 0., 0. 
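+    # Only global rank 0 reaches this point; the other ranks returned zeros above so callers
+    # can unpack the 4-tuple unconditionally.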
+ + if sync: + torch.cuda.synchronize() + + if py3nvml is not None: + py3nvml.nvmlInit() + handle = py3nvml.nvmlDeviceGetHandleByIndex(local_rank) + info = py3nvml.nvmlDeviceGetMemoryInfo(handle) + total_used = info.used / _GB + total_used_str = f"Totally used GPU memory: {total_used} GB." + else: + total_used_str = "" + + # Convert to GB for printing. + alloced = torch.cuda.memory_allocated(device=local_rank) / _GB + max_alloced = torch.cuda.max_memory_allocated(device=local_rank) / _GB + cached = torch.cuda.memory_reserved(device=local_rank) / _GB + max_cached = torch.cuda.max_memory_reserved(device=local_rank) / _GB + + print( + f"[GPU MEMORY]@{step:04d} " + f"(torch, rank, device) = ({torch.__version__}, {rank}, {local_rank}), " + f"(alloc, max_alloc, cache, max_cache) = ({alloced:{_FORMAT}}, {max_alloced:{_FORMAT}}, " + f"{cached:{_FORMAT}}, {max_cached:{_FORMAT}}) GB. " + f"{total_used_str} [{tag:10s}]", + ) + + if reset_max: + torch.cuda.reset_peak_memory_stats() + + if py3nvml is not None: + py3nvml.nvmlShutdown() + + usage = { + "allocated": alloced, + "max_allocated": max_alloced, + "max_reserved": max_cached, + "reserved": cached, + } + for writer in writers: + writer.add_scalars(f"GPUMemoryGB/{tag}", usage, step) + + return alloced, max_alloced, cached, max_cached + + +def memory_status_cpu( # pylint: disable=too-many-locals + tag: str = "", writers: Tuple[Any] = (), step: int = 0 +) -> Tuple[float]: + """Memory status cpu.""" + rank = dist.get_rank() + local_rank = rank % torch.cuda.device_count() + + if rank > 0: + return 0., 0., 0., 0. + + import gc # pylint: disable=import-outside-toplevel + + global last_mem_usage + global base_mem_usage # pylint: disable=global-variable-not-assigned + + gc.collect() + gc.collect() + gc.collect() + objects = gc.get_objects() + tensors = [obj for obj in objects if isinstance(obj, torch.Tensor) and not obj.is_cuda] + torch_usage = 0 + for t in tensors: # pylint: disable=invalid-name + torch_usage += t.numel() * dtype_to_bit[t.dtype] + # total_usage = psutil.virtual_memory()[3] # This will get the total usage for all processes + current_usage = process.memory_info().data + total_usage = current_usage - base_mem_usage + usage_change = current_usage - last_mem_usage + last_mem_usage = current_usage + + torch_usage /= _GB + total_usage /= _GB + usage_change /= _GB + base_usage = base_mem_usage / _GB + + print( + f"[CPU MEMORY]@{step:04d} " + f"(torch, rank, device) = ({torch.__version__}, {rank}, {local_rank}), " + f"(torch tensor, mem, change since last measurement, base) = ({torch_usage:{_FORMAT}}, " + f"{total_usage:{_FORMAT}}, {usage_change:{_FORMAT}}, {base_usage:{_FORMAT}}): " + f"{tag}" + ) + + usage = { + "base": base_usage, + "delta": usage_change, + "torch": torch_usage, + "total": total_usage, + } + for writer in writers: + writer.add_scalars(f"CPUMemoryGB/{tag}", usage, step) + + return torch_usage, total_usage, usage_change, base_usage diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/sm_env_utils.py b/3.test_cases/17.SM-modelparallelv2/scripts/sm_env_utils.py new file mode 100644 index 00000000..b6fe1113 --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/sm_env_utils.py @@ -0,0 +1,17 @@ +"""SageMaker environment utils.""" +import os + +SM_ENV_KEY = "TRAINING_JOB_ARN" + + +def enable_dummy_sm_env(): + """ + Sets up dummy environment variable + to handle SageMaker platform guardrail. + + Necessary for both Rubik and Herring. 
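+    (Rubik and Herring refer to the SageMaker model parallel and data parallel
+    libraries, respectively.)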
+ """ + if os.environ.get(SM_ENV_KEY, None) is None: + # Set the SageMaker environment variable to a dummy value + # if not set. + os.environ[SM_ENV_KEY] = "0" diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/test_utils.py b/3.test_cases/17.SM-modelparallelv2/scripts/test_utils.py new file mode 100644 index 00000000..67e8d144 --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/test_utils.py @@ -0,0 +1,131 @@ +"""Unit test for utils.py.""" + +from typing import Any, Dict, Optional +import unittest + +import numpy as np +from parameterized import parameterized +import utils + + +class TestUtils(unittest.TestCase): + """Unit test for the Utils class.""" + + @parameterized.expand(( + ( + None, + None, + ), + ( + np.array([]), + None, + ), + ( + np.array([1.,]), + { + "data": np.array([1.,]), + "data_sorted": np.array([1.,]), + "len": 1, + # Stats. + "max": 1., + "mean": 1., + "median": 1., + "min": 1., + "std": 0., + }, + ), + ( + np.array([9., 1.]), + { + "data": np.array([9., 1.]), + "data_sorted": np.array([1., 9.]), + "len": 2, + # Stats. + "max": 9., + "max2": 1., + "mean": 5., + "median": 5., + "min": 1., + "min2": 9., + "std": 4, + }, + ), + ( + np.array([3., 1., 8.]), + { + "data": np.array([3., 1., 8.]), + "data_sorted": np.array([1., 3., 8.]), + "len": 3, + # Stats. + "max": 8., + "max2": 3., + "max3": 1., + "mean": 4., + "median": 3., + "min": 1., + "min2": 3., + "min3": 8., + "std": 2.943920288775949, + }, + ), + # Actual NCCL tests E2E. + ( + utils.parse_nccl_test_log("./scripts/FILE-DOES-NOT-EXIST.txt"), + None, + ), + ( + utils.parse_nccl_test_log("./scripts/nccl_test_ns02.txt"), + { + "data": np.array([29.8872, 28.9057]), + "data_sorted": np.array([28.9057, 29.8872]), + "len": 2, + # Stats. + "max": 29.8872, + "max2": 28.9057, + "mean": 29.39645, + "median": 29.39645, + "min": 28.9057, + "min2": 29.8872, + "std": 0.49075000000000024, + }, + ), + ( + utils.parse_nccl_test_log("./scripts/nccl_test_ns04.txt"), + { + "data": np.array([3.68419, 3.67744, 3.67917, 3.6783]), + "data_sorted": np.array([3.67744, 3.6783, 3.67917, 3.68419]), + "len": 4, + # Stats. 
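+                # max2..max4 and min2..min4 are the 2nd..4th largest/smallest values of
+                # data_sorted, as computed in utils.get_nccl_test_report.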
+ "max": 3.68419, + "max2": 3.67917, + "max3": 3.6783, + "max4": 3.67744, + "mean": 3.6797750000000002, + "median": 3.678735, + "min": 3.67744, + "min2": 3.6783, + "min3": 3.67917, + "min4": 3.68419, + "std": 0.002621359380169051, + }, + ), + )) + def test_get_nccl_test_report( + self, + bandwidth: Optional[np.ndarray], + expected_report: Optional[Dict[str, Any]], + ): + """Unit test for get_nccl_test_report.""" + report = utils.get_nccl_test_report(bandwidth) + + if expected_report is None: + self.assertIsNone(report) + return + + for key in ("data", "data_sorted"): + np.testing.assert_allclose(report.pop(key), expected_report.pop(key)) + self.assertEqual(report, expected_report) + + +if __name__ == "__main__": + unittest.main() diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/train_external.py b/3.test_cases/17.SM-modelparallelv2/scripts/train_external.py new file mode 100644 index 00000000..18ba3c40 --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/train_external.py @@ -0,0 +1,16 @@ +"""Train.py.""" + +import os +os.environ["NVTE_TORCH_COMPILE"] = "0" + +from arguments import parse_args +import train_lib + +def main(): + """Main function to train GPT.""" + args, _ = parse_args() + train_lib.main(args) + + +if __name__ == "__main__": + main() diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/train_lib.py b/3.test_cases/17.SM-modelparallelv2/scripts/train_lib.py new file mode 100644 index 00000000..52c9555f --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/train_lib.py @@ -0,0 +1,717 @@ +"""Train lib function.""" +import datetime +import functools +import math +import re +import time +from contextlib import nullcontext + +# pylint: disable=fixme,import-error,import-outside-toplevel,invalid-name,no-name-in-module,wrong-import-order +import numpy as np +import torch +import torch.distributed as dist +import torch.sagemaker as tsm +import torch.utils.data + +import transformer_engine +from transformer_engine.common.recipe import Format, DelayedScaling + +import transformers +from accelerate import init_empty_weights +from checkpoints import ( + _CHECKPOINT_DIR_REGEX, + _DEFAULT_STATE_DICT_TYPE, + CheckpointingMethod, + get_coordinator_rank, + is_action_rank, + load_checkpoint, + save_checkpoint, +) +from data.pipelines import GPTDataPipeline, create_data_pipeline +from fsdp_utils import get_backward_fetch_policy, get_sharding_strategy, get_transformer_layer +from logging_utils import ( + create_args_table, + get_logger, + log_and_write_eval_metrics, + log_train_metrics, + show_env_vars, + write_nccl_test_stats, + write_metrics_train_step, +) +from memory_tracker import memory_status, memory_status_cpu +from packaging import version as pversion +from torch import optim +from torch.distributed.elastic.multiprocessing.errors import record +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp import MixedPrecision +from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, transformer_auto_wrap_policy +from torch.sagemaker import transform +from torch.sagemaker.delayed_param import DelayedParamIniter +from torch.sagemaker.grad_norm import clip_grad_norm_ +from torch.sagemaker.utils import utils as tsm_utils # pylint: disable=no-name-in-module +from train_utils import ( + apply_activation_checkpoint, + compute_num_params, + compute_tflops, + create_model, + get_learning_rate_scheduler, + get_model_config, + get_param_groups_by_weight_decay, + patch_neox_rope, +) +from transformers import set_seed +import 
utils + +logger = get_logger() + + +def finetune_with_pretrained_weights_check(args) -> bool: + # returns True for start of finetuning only + return args.hf_pretrained_model_name_or_dir is not None and args.resume_from_checkpoint is None + + +def finetune_check(args): + # returns True for start of finetuning as well as resuming + return args.hf_pretrained_model_name_or_dir is not None + + +def eval_model(model, data_pipeline, num_batches): + """Eval step.""" + model = model.eval() + n_batches = 0 + loss = 0.0 + + with torch.no_grad(): + for batch_idx, input_data in enumerate(data_pipeline.val_dataloader): + input_ids, _ = data_pipeline.get_val_batch(input_data) + + if batch_idx >= num_batches: + break + + loss += model(input_ids=input_ids, attention_mask=None, labels=input_ids)["loss"] + n_batches += 1 + + if n_batches > 0: + detached_loss = loss.detach() + torch.distributed.all_reduce(detached_loss) + loss = detached_loss.item() / dist.get_world_size() + loss /= n_batches + ppl = math.exp(loss) + else: + loss = -1.0 + ppl = -1.0 + + return loss, ppl + + +def reduce_loss(loss): + loss_detached = loss.detach() + dist.all_reduce(loss_detached) + loss_scalar = loss_detached.item() / dist.get_world_size() + return loss_scalar + + +def train_step( # pylint: disable=too-many-arguments,too-many-branches,too-many-locals + args, display_step: int, batch_idx: int, nvtx_warmup_iters, + data_pipeline, input_data, model, optimizer, lr_scheduler, writers, fp8_recipe +): + if batch_idx >= nvtx_warmup_iters: + torch.cuda.nvtx.range_push(f"iteration{batch_idx}") + + input_ids, _, labels = data_pipeline.get_batch(input_data) + + if batch_idx == 0: + # checking only on batch 0 to reduce checks during runtime + assert ( + input_ids.shape[1] == args.max_context_width + ), f"Input data passed {input_ids.shape} does not respect max_context_width set. Note that this is not strictly necessary, but added to prevent mistakes. If you intend to do this, please remove this check." + assert ( + input_ids.shape[1] <= args.max_context_width + ), "Input data passed is larger than max_context_width for model. 
You need to change max_context_width so model can expect larger sequences" + + optimizer.zero_grad(set_to_none=True) + + torch.cuda.synchronize() + step_start = time.time() + + if batch_idx >= nvtx_warmup_iters: + torch.cuda.nvtx.range_push("forward") + + # uses default causal mask + if args.fp8==1 and args.use_smp_implementation==1: + with transformer_engine.pytorch.fp8_autocast(enabled=args.fp8, fp8_recipe=fp8_recipe, fp8_group=tsm.state.world_process_group): + loss = model(input_ids=input_ids, attention_mask=None, labels=labels)["loss"] + else: + loss = model(input_ids=input_ids, attention_mask=None, labels=labels)["loss"] + + if batch_idx >= nvtx_warmup_iters: + # for forward + torch.cuda.nvtx.range_pop() + + if args.enable_memory_profiling > 0 and batch_idx < 5: + memory_status_cpu("After forward", writers=writers, step=display_step) + memory_status(tag="After forward", writers=writers, step=display_step) + + if batch_idx >= nvtx_warmup_iters: + torch.cuda.nvtx.range_push("backward") + + loss.backward() + + if batch_idx >= nvtx_warmup_iters: + # for backward + torch.cuda.nvtx.range_pop() + + if args.enable_memory_profiling > 0 and batch_idx < 5: + memory_status_cpu("After train step", writers=writers, step=display_step) + memory_status(tag="After train step", writers=writers, step=display_step) + + if batch_idx >= nvtx_warmup_iters: + torch.cuda.nvtx.range_push("opt_step") + + grad_norm = clip_grad_norm_(model, args.grad_clip) + optimizer.step() + lr_scheduler.step() + + if batch_idx >= nvtx_warmup_iters: + # for opt step + torch.cuda.nvtx.range_pop() + + if args.clean_cache > 0: + # empty the cache to avoid OOM + torch.cuda.empty_cache() + + if batch_idx >= nvtx_warmup_iters: + # for step + torch.cuda.nvtx.range_pop() + + torch.cuda.synchronize() + step_time = time.time() - step_start + + if args.enable_memory_profiling > 0 and batch_idx < 5: + memory_status(tag="After opt step", writers=writers, step=display_step) + + batch_num_sequences = input_ids.shape[0] + batch_seqlen = input_ids.shape[1] + return loss, step_time, batch_num_sequences, batch_seqlen, grad_norm + + +# pylint: disable=no-member,too-many-arguments,too-many-branches,too-many-locals,too-many-statements +def train( + model, + optimizer, + lr_scheduler, + writers, + model_config, + start_epoch, + start_train_path_index, + resume_from_sequence_number, + num_params, + total_steps, + args, + global_rank, + world_size, + checkpointing_pg_metadata, + fp8_recipe, +): + """Train.""" + if args.enable_memory_profiling > 0: + memory_status_cpu(tag="Before train step", writers=writers, step=total_steps - 1) + + model.train() + dp_rank = global_rank + dp_size = world_size + + if args.tensor_parallel_degree > 1: + dp_rank //= args.tensor_parallel_degree + dp_size //= args.tensor_parallel_degree + + if global_rank == 0: + logger.info("Creating train dataloader") + + throughputs = [] + # Set the same seed for computation + set_seed(args.seed) + + data_pipeline = create_data_pipeline( + args, start_train_path_index, resume_from_sequence_number, dp_rank, dp_size + ) + cur_seq_index = resume_from_sequence_number + epoch = start_epoch + while total_steps < args.max_steps: + nvtx_warmup_iters = 3 + if global_rank == 0: + logger.info("Starting training with epoch %s.", epoch) + + # additional loop around is for GPTDataset as there can be multiple dataloaders + if isinstance(data_pipeline, GPTDataPipeline): + # with new path if incremented at the end of this for loop + data_pipeline.create_train_dataset() + + for batch_idx, input_data in 
enumerate(data_pipeline.train_dataloader): + if total_steps >= args.max_steps: + break + + if args.profile_nsys > 0 and batch_idx == nvtx_warmup_iters: + torch.cuda.cudart().cudaProfilerStart() + + loss, step_time, batch_num_sequences, batch_seqlen, grad_norm = train_step( + args, + total_steps, + batch_idx, + nvtx_warmup_iters, + data_pipeline, + input_data, + model, + optimizer, + lr_scheduler, + writers, + fp8_recipe, + ) + total_steps += 1 + cur_seq_index += batch_num_sequences + sample_processed = batch_num_sequences * dp_size + throughput = sample_processed / step_time + throughputs.append(throughput) + + tflops_per_gpu = compute_tflops(throughput, num_params, world_size, batch_seqlen) + + if not total_steps % args.logging_freq and args.log_reduced_training_loss > 0: + loss_scalar = reduce_loss(loss) + else: + loss_scalar = loss.item() + + current_lr = lr_scheduler.get_lr() + display_step = total_steps - 1 + if global_rank == 0: + write_metrics_train_step( + writers, + display_step, + loss_scalar, + throughput, + tflops_per_gpu, + current_lr, + grad_norm, + ) + if not total_steps % args.logging_freq: + log_train_metrics( + args, + total_steps, + display_step, + loss_scalar, + throughput, + tflops_per_gpu, + current_lr, + grad_norm, + throughputs, + num_params, + world_size, + batch_seqlen, + ) + + # evaluate on validation + if args.validation_freq and not total_steps % args.validation_freq: + cur_state = np.random.get_state() + torch.cuda.empty_cache() + val_loss, val_ppl = eval_model(model, data_pipeline, args.validation_batches) + if global_rank == 0: + log_and_write_eval_metrics(writers, display_step, val_loss, val_ppl) + model = model.train() + if args.preserve_np_state > 0: + np.random.set_state(cur_state) + + # checkpoint + if not total_steps % args.checkpoint_freq[0]: + + if isinstance(data_pipeline, GPTDataPipeline): + save_train_path_index = data_pipeline.cur_train_path + else: + save_train_path_index = 0 + save_train_seq_index = cur_seq_index + # technically we have processed save_train_seq_index sequences in this file + # and so index to start from is save_train_seq_index + user_content = { + "cli_args": args.__dict__, + "model_config": model_config, + "num_params": num_params, + "total_steps": total_steps, + "epoch": epoch, + "start_train_path_index": save_train_path_index, + "resume_from_sequence_number": save_train_seq_index, + } + + subdir = f"{args.model_type}-{total_steps}steps" + if global_rank == 0 and not re.match(_CHECKPOINT_DIR_REGEX, subdir): + raise ValueError( + f"Please double check hard-coded checkpoint subdir pattern: `{subdir}` " + f"not matching `{_CHECKPOINT_DIR_REGEX}`." 
+ ) + + if args.enable_memory_profiling > 0: + msg = f"({_DEFAULT_STATE_DICT_TYPE})" + memory_status(tag=f"Before ckpt {msg}", writers=writers, step=display_step) + save_checkpoint( + model, + optimizer, + lr_scheduler, + user_content, + get_sharding_strategy(args.sharding_strategy), + args.checkpoint_dir[0], + subdir, + args.num_kept_checkpoints[0], + checkpointing_pg_metadata, + tensor_parallel_degree=int(args.tensor_parallel_degree), + checkpoint_type=args.checkpoint_type, + ) + if args.enable_memory_profiling > 0: + msg = f"({_DEFAULT_STATE_DICT_TYPE})" + memory_status(tag=f"After ckpt {msg}", writers=writers, step=display_step) + + if isinstance(data_pipeline, GPTDataPipeline): + incremented_in_epoch = data_pipeline.increment_path_in_epoch() + if not incremented_in_epoch: + # path index set to 0 + epoch += 1 + else: + epoch += 1 + # Using median throughput across all steps, could be more robust. + return total_steps, np.median(throughputs) if throughputs else 0 + + +@record +def main(args): + """Main function to train GPT.""" + global_start_time = time.time() + + # Sanity check for args. + # - Checkpoints. + # TODO(sliuxl): Supporting one single checkpoint dir now, and multiple dirs support is missing. + ckpt_lens = ( + len(args.checkpoint_dir), + len(args.checkpoint_freq), + len(args.num_kept_checkpoints), + ) + if len(set(ckpt_lens)) != 1: + raise ValueError(f"Len mismtach for checkpoint dir, freq vs num to keep: {ckpt_lens}.") + + if args.distributed_backend == "smddp": + import smdistributed.dataparallel.torch.torch_smddp # pylint: disable=unused-import + + dist.init_process_group(args.distributed_backend, timeout=datetime.timedelta(seconds=7200)) + global_rank = dist.get_rank() + device = global_rank % torch.cuda.device_count() + world_size = dist.get_world_size() + + if args.tensorboard_dir and global_rank == 0: + from torch.utils.tensorboard import SummaryWriter + + logger.info("Writing metrics for tensorboard to %s.", args.tensorboard_dir) + writers = tuple(SummaryWriter(log_dir=tb_dir) for tb_dir in args.tensorboard_dir) + table_str = create_args_table(args.__dict__) + for writer in writers: + writer.add_text("Arguments", table_str) + else: + writers = () + + if args.nccl_test_log: + report = utils.get_nccl_test_report(utils.parse_nccl_test_log(args.nccl_test_log)) + if report is not None and global_rank == 0: + write_nccl_test_stats(writers, report) + + smp_config_dict = { # REMOVE_IN_PUBLIC_NOTEBOOK + "activation_loading_horizon": args.activation_loading_horizon, # REMOVE_IN_PUBLIC_NOTEBOOK + "sm_activation_offloading": args.offload_activations > 0, # REMOVE_IN_PUBLIC_NOTEBOOK + } # REMOVE_IN_PUBLIC_NOTEBOOK + if args.shard_degree is not None: # REMOVE_IN_PUBLIC_NOTEBOOK + smp_config_dict["hybrid_shard_degree"] = args.shard_degree # REMOVE_IN_PUBLIC_NOTEBOOK + smp_config_dict["tensor_parallel_degree"] = args.tensor_parallel_degree # REMOVE_IN_PUBLIC_NOTEBOOK + tsm.init(smp_config_dict) # NOTE: Please leave `tsm.init` before `args.tensor_parallel_degree` being used, otherwise notebook example might have issues. + + if args.use_smp_implementation < 1 < args.tensor_parallel_degree: + args.use_smp_implementation = 1 + if global_rank == 0: + logger.info( + "Tensor parallelism is enabled as tensor_parallel_degree is set to %d > 1. 
" + "Switching use_smp_implementation to 1 so we can use SMP optimized implementation.", + args.tensor_parallel_degree + ) + if args.use_smp_implementation: + # For our Mem usage fix to TE, this needs to be True + args.use_orig_params = 1 + + if args.use_synthetic_data and args.validation_freq is not None: + # Overriding validation freq to None as synthetic data + args.validation_freq = None + + show_env_vars(0) + + if global_rank == 0: + for index, (key, value) in enumerate(sorted(args.__dict__.items()), 1): + logger.info("Arguments [%03d/%03d] %-30s: %s", index, len(args.__dict__), key, value) + logger.info("Transformers version: %s", transformers.__version__) + logger.info("World size = %d: # nodes = %d.", world_size, world_size / 8) + + gbs = ( + world_size + * args.max_context_width + * args.train_batch_size + / args.tensor_parallel_degree + ) + logger.info("Global batch size in tokens: %10d (%5.2fM).", gbs, gbs / 1024 ** 2) + + set_seed(args.seed) + + if args.enable_memory_profiling > 0: + memory_status_cpu(tag="Before model creation", writers=writers) + + if args.bf16: + dtype = torch.bfloat16 + else: + dtype = torch.get_default_dtype() + + if finetune_check(args): + from transformers import AutoConfig + + # Using config for finetune mode, else uses args to create model + model_config = AutoConfig.from_pretrained(args.hf_pretrained_model_name_or_dir) + else: + model_config = get_model_config(args) + + delayed_param_initer = None + with tsm_utils.timeit(True, "Model creation", global_rank): + if args.delayed_param: + if finetune_with_pretrained_weights_check(args) and dist.get_rank() == 0: + # create model with pretrained weights on one rank even if we want to use + # delayed param, param init on other ranks will still be delayed + model = create_model( + args, + model_config=model_config, + dtype=dtype, + pretrained_model_weights=args.hf_pretrained_model_name_or_dir + if finetune_with_pretrained_weights_check(args) + else None, + ) + num_params = compute_num_params(model) + else: + with init_empty_weights(): + model = create_model( + args, + model_config=model_config, + dtype=dtype, + ) + num_params = compute_num_params(model) + if finetune_check(args): + dist.barrier() + else: + model = create_model( + args, + model_config=model_config, + dtype=dtype, + pretrained_model_weights=args.hf_pretrained_model_name_or_dir + if finetune_with_pretrained_weights_check(args) and dist.get_rank() == 0 + else None, + ) + num_params = compute_num_params(model) + + if args.use_smp_implementation: + load_state_dict_from_rank0 = finetune_with_pretrained_weights_check(args) + model = transform(model, load_state_dict_from_rank0=load_state_dict_from_rank0) + + if args.delayed_param: + # param init fn for delayed param creation + if finetune_check(args): + if dist.get_rank() != 0: + delayed_param_initer = DelayedParamIniter(model) + else: + delayed_param_initer = DelayedParamIniter(model) + + assert set(x.dtype for x in model.parameters()) == set( + [torch.float32] + ), "Model parameters should be in fp32 for FSDP mixed precision" + + if global_rank == 0: + logger.info( + "Created model with total parameters: %d (%.2f B)", num_params, num_params * 1e-9 + ) + + transformer_layer = get_transformer_layer(args.model_type, args.use_smp_implementation) + + if args.auto_wrap_policy == "transformer_auto_wrap_policy": + gpt_auto_wrap_policy = functools.partial( + transformer_auto_wrap_policy, + transformer_layer_cls={ + transformer_layer, + }, + ) + elif args.auto_wrap_policy == "size_based_auto_wrap_policy": + 
gpt_auto_wrap_policy = functools.partial( + size_based_auto_wrap_policy, + ) + + torch.cuda.set_device(device) + if args.bf16: + # buffer set to fp32 as some models in HF such as llama hard code buffers to fp32 + # to be similar with that we set this to fp32 + buffer_dtype = torch.float32 if args.use_smp_implementation else dtype + mixed_precision_policy = MixedPrecision( + param_dtype=dtype, reduce_dtype=dtype, buffer_dtype=buffer_dtype + ) + else: + mixed_precision_policy = None + + if args.enable_memory_profiling > 0: + memory_status_cpu(tag="Before FSDP wrapper", writers=writers) + + sharding_strategy = get_sharding_strategy(args.sharding_strategy) + + with ( + delayed_param_initer.validate_params_and_buffers_inited() + if (delayed_param_initer and not finetune_with_pretrained_weights_check(args)) + else nullcontext(), + tsm_utils.timeit(True, "FSDP constructor", global_rank), + ): + model = FSDP( # pylint: disable=unexpected-keyword-arg + model, + auto_wrap_policy=gpt_auto_wrap_policy, + mixed_precision=mixed_precision_policy, + sharding_strategy=sharding_strategy, + backward_prefetch=get_backward_fetch_policy(args.backward_fetch_policy), + forward_prefetch=args.forward_prefetch, + limit_all_gathers=args.limit_all_gathers, + device_id=torch.cuda.current_device(), + use_orig_params=args.use_orig_params > 0, + param_init_fn=delayed_param_initer.get_param_init_fn() + if delayed_param_initer + else None, + post_param_init_fn=delayed_param_initer.get_post_param_init_fn() + if delayed_param_initer + else None, + sync_module_states=finetune_with_pretrained_weights_check(args), + ) + # Barrier is a workaround to reduce extra memory usage with SMDDP backend + # after the broadcast that happens when we use sync_module_states + # This can be removed once the SMDDP issue is fixed + dist.barrier() + + if global_rank == 0: + logger.info("Wrapped model with FSDP") + + if args.enable_memory_profiling > 0: + memory_status(tag="After FSDP wrapper", writers=writers) + + fp8_recipe = None + if args.fp8==1 and args.use_smp_implementation==1: + fp8_recipe = DelayedScaling(fp8_format=Format.HYBRID, amax_history_len=args.fp8_amax_history_len, amax_compute_algo=args.fp8_amax_compute_algo) + + if args.activation_checkpointing > 0: + apply_activation_checkpoint(args, model=model) + + if args.offload_activations > 0: + from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import offload_wrapper + + model = offload_wrapper(model) + + # Patch RoPE for GPT NEoX where they are created on Host to move them to Device + if args.use_smp_implementation == 0 and args.model_type == "gpt_neox" and args.patch_neox_rope > 0: + patch_neox_rope(model) + + param_groups = get_param_groups_by_weight_decay(model) + + optimizer = optim.AdamW( + param_groups, betas=(args.beta1, args.beta2), lr=args.lr, weight_decay=args.weight_decay + ) + + if global_rank == 0: + logger.info("Created optimizer") + + lr_scheduler = get_learning_rate_scheduler(optimizer, args) + + checkpointing_pg_metadata = ( + model.process_group, + get_coordinator_rank(model.process_group), + is_action_rank(global_rank), + ) + + if args.resume_from_checkpoint: + ( + model, + optimizer, + lr_scheduler, + epoch, + total_steps, + start_train_path_index, + resume_from_sequence_number, + ) = load_checkpoint( + args, + model, + optimizer, + lr_scheduler, + args.resume_from_checkpoint, + sharding_strategy, + checkpointing_pg_metadata, + tensor_parallel_degree=int(args.tensor_parallel_degree), + checkpoint_type=args.checkpoint_type, + ) + + else: + total_steps = 
0 + epoch = 0 + start_train_path_index = 0 + resume_from_sequence_number = 0 + + train_start_time = time.time() + # total_steps, throughput, loss + total_steps, _ = train( + model, + optimizer, + lr_scheduler, + writers, + model_config, + epoch, + start_train_path_index, + resume_from_sequence_number, + num_params, + total_steps, + args, + global_rank, + world_size, + checkpointing_pg_metadata, + fp8_recipe, + ) + time_now = time.time() + total_sec = time_now - global_start_time + train_sec = time_now - train_start_time + + dist.barrier() + + if args.save_final_model: + save_checkpoint( + model, + None, + None, + {"model_config": model_config}, + None, + args.model_dir if args.model_dir is not None else args.checkpoint_dir[0], + "" if args.model_dir is not None else "model", + 1, + None, + int(args.tensor_parallel_degree), + checkpoint_type=CheckpointingMethod.FULL, + ) + + if global_rank == 0: + train_min = train_sec / 60.0 + total_min = total_sec / 60.0 + + for writer in writers: + runtime = { + "total": total_min, + "train": train_min, + } + writer.add_scalars("Perf/runtime", runtime, total_steps - 1) + + logger.info( + "FSDP training finished successfully %fs (%fmin) out of (%fmin).", + train_sec, train_min, total_min + ) + + dist.destroy_process_group() diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/train_utils.py b/3.test_cases/17.SM-modelparallelv2/scripts/train_utils.py new file mode 100644 index 00000000..49bd7f64 --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/train_utils.py @@ -0,0 +1,348 @@ +"""Train utils.""" + +import functools + +import numpy as np +import torch + +# pylint: disable=import-error,import-outside-toplevel,invalid-name,no-member,no-name-in-module,protected-access +import transformers +from fsdp_utils import get_transformer_layer +from learning_rates import AnnealingLR # pylint: disable=wrong-import-order +from logging_utils import get_logger +from packaging import version as pversion +from torch.nn import LayerNorm +from transformers import AutoModelForCausalLM +from transformers.models.llama.modeling_llama import LlamaRMSNorm + +_logger = get_logger() + + +def compute_num_params(model): + """Get num params.""" + num_params = 0 + seen = set() + for p in model.parameters(): # pylint: disable=invalid-name + if p not in seen: + seen.add(p) + if hasattr(p, "ds_shape"): + num_params += np.prod(p.ds_shape) + else: + num_params += np.prod(p.size()) + + return num_params + + +def compute_tflops(throughput, num_params, world_size, seq_len): + """ + Compute TFLOPs by using the 6 factor which gives us model tflops. + This makes it easier to compare with frameworks such as megatron + which may not use activation checkpointing. + Using factor 8 gives us hardware tflops when using activation checkpointing. + + Based on the formula in + https://developer.nvidia.com/blog/scaling-language-model-training-to-a-trillion-parameters-using-megatron/ + """ + return 6 * throughput * num_params / world_size * seq_len * 1e-12 + + +def get_learning_rate_scheduler(optimizer, args): + """Get learning rate scheduler.""" + use_checkpoint_lr_scheduler = args.resume_from_checkpoint is not None + + # Add linear learning rate scheduler. 
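+    # `args.warmup` and `args.plateau` are fractions of the decay horizon: with assumed
+    # values max_steps=10000, warmup=0.01 and plateau=0.0, this gives warmup_iter=100
+    # and plateau_iter=100 before decay begins.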
+ if args.lr_decay_iters is not None: + num_iters = args.lr_decay_iters + else: + num_iters = args.max_steps + num_iters = max(1, num_iters) + init_step = 0 + warmup_iter = args.warmup * num_iters + plateau_iter = warmup_iter + args.plateau * num_iters + lr_scheduler = AnnealingLR( + optimizer, + start_lr=args.lr, + warmup_iter=warmup_iter, + plateau_iter=plateau_iter, + total_iters=num_iters, + decay_style=args.lr_decay_style, + last_iter=init_step, + min_lr=args.min_lr, + use_checkpoint_lr_scheduler=use_checkpoint_lr_scheduler, + override_lr_scheduler=False, + ) + + return lr_scheduler + + +def get_param_groups_by_weight_decay(module): + """Get param groups.""" + weight_decay_params = {"params": []} + no_weight_decay_params = {"params": [], "weight_decay": 0.0} + param_ids = set() + + for module_ in module.modules(): + # if isinstance(module_, FusedLayerNorm) or + if isinstance(module_, (LayerNorm, LlamaRMSNorm)): + for p in list( + module_._parameters.values() + ): # pylint: disable=invalid-name,protected-access + if p is not None and id(p) not in param_ids: + no_weight_decay_params["params"].append(p) + param_ids.add(id(p)) + else: + for n, p in list( + module_._parameters.items() + ): # pylint: disable=invalid-name,protected-access + if p is not None and n != "bias" and id(p) not in param_ids: + weight_decay_params["params"].append(p) + param_ids.add(id(p)) + for n, p in list( + module_._parameters.items() + ): # pylint: disable=invalid-name,protected-access + if p is not None and n == "bias" and id(p) not in param_ids: + no_weight_decay_params["params"].append(p) + param_ids.add(id(p)) + return weight_decay_params, no_weight_decay_params + + +def create_model(args, model_config, dtype, pretrained_model_weights=None): + """Create model.""" + if pretrained_model_weights: + _logger.info("Loading pretrained weights from %s.", pretrained_model_weights) + if pversion.parse(transformers.__version__) < pversion.parse("4.37.1"): + model = AutoModelForCausalLM.from_pretrained(pretrained_model_weights) + else: + model = AutoModelForCausalLM.from_pretrained(pretrained_model_weights, attn_implementation="flash_attention_2") + else: + if pversion.parse(transformers.__version__) < pversion.parse("4.37.1"): + model = AutoModelForCausalLM.from_config(model_config) + else: + model = AutoModelForCausalLM.from_config(model_config, attn_implementation="flash_attention_2") + + if pversion.parse(transformers.__version__) >= pversion.parse("4.37.1"): + args.use_smp_flash_attn = 0 + _logger.info("For transformers greater than or equal to 4.37.1, automatically use integrated flash attn.") + + if args.use_smp_flash_attn: + if args.model_type == "gpt_neox": + layout = "b h s d" + layers = model.gpt_neox.layers + attn_name = "attention" + elif args.model_type == "gpt2": + layout = "b h s d" + layers = model.transformer.h + attn_name = "attn" # Note: Only self attention is referenced + elif args.model_type == "llama_v2": + layout = "b s h d" + layers = model.model.layers + attn_name = "self_attn" + else: + raise ValueError(f"Unsupported model type {args.model_type}") + + def new_attn( + self, q, k, v, attention_mask=None, head_mask=None + ): # pylint: disable=too-many-arguments + del attention_mask + del head_mask + attn_weights = None + return ( + self.flashmod((q, k, v), causal=True, cast_dtype=dtype, layout=layout), + attn_weights, + ) + + if args.model_type == "llama_v2": + # pre 4.34 we use rubik's class + from torch.sagemaker.nn.huggingface.llama_flashattn import LlamaFlashAttention + + flash_attn_class = 
LlamaFlashAttention + for layer in layers: + prev_layer = getattr(layer, attn_name) + setattr(layer, attn_name, flash_attn_class(model.config)) + attn_layer = getattr(layer, attn_name) + attn_layer.pretraining_tp = model.config.pretraining_tp + with torch.no_grad(): + attn_layer.q_proj.weight.copy_(prev_layer.q_proj.weight) + attn_layer.k_proj.weight.copy_(prev_layer.k_proj.weight) + attn_layer.v_proj.weight.copy_(prev_layer.v_proj.weight) + attn_layer.o_proj.weight.copy_(prev_layer.o_proj.weight) + else: + from torch.sagemaker.nn.attn import ( # pylint: disable=no-name-in-module + FlashSelfAttention, + ) + + for layer in layers: + getattr(layer, attn_name).flashmod = FlashSelfAttention(attention_dropout_prob=0.0) + getattr(layer, attn_name)._attn = functools.partial( + new_attn, getattr(layer, attn_name) + ) + + return model + + +def get_model_config(args): + """Get model config.""" + if "gpt_neox" in args.model_type: + from transformers import GPTNeoXConfig + + model_config = GPTNeoXConfig( + vocab_size=args.vocab_size, + hidden_size=args.hidden_width, + num_hidden_layers=args.num_layers, + num_attention_heads=args.num_heads, + hidden_act="gelu", + intermediate_size=4 * args.hidden_width, + rotary_pct=args.rotary_pct, + rotary_emb_base=args.rotary_emb_base, + max_position_embeddings=args.max_context_width, + layer_norm_eps=1e-05, + initializer_range=args.initializer_range, + use_cache=False, + tie_word_embeddings=False, + use_parallel_residual=True, + attention_dropout=0.0, + hidden_dropout=0.0, + ) + elif "gpt2" in args.model_type: + from transformers import GPT2Config + + model_config = GPT2Config( + vocab_size=args.vocab_size, + n_positions=args.max_context_width, + n_embd=args.hidden_width, + n_layer=args.num_layers, + n_head=args.num_heads, + n_inner=None, + activation_function="gelu_new", + resid_pdrop=args.resid_pdrop, + embd_pdrop=args.embd_pdrop, + attn_pdrop=args.attn_pdrop, + layer_norm_epsilon=1e-05, + initializer_range=args.initializer_range, + summary_type="cls_index", + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=args.summary_first_pdrop, + use_cache=False, + bos_token_id=50256, + eos_token_id=50256, + return_dict=True, + ) + elif "llama_v2" in args.model_type: + from transformers import LlamaConfig + + model_config = LlamaConfig( + vocab_size=args.vocab_size, + hidden_size=args.hidden_width, + intermediate_size=args.llama_intermediate_size, + num_hidden_layers=args.num_layers, + num_attention_heads=args.num_heads, + num_key_value_heads=args.num_key_value_heads, + hidden_act="silu", + max_position_embeddings=args.max_context_width, + initializer_range=args.initializer_range, + rms_norm_eps=1e-5, + use_cache=False, + pretraining_tp=1, + tie_word_embeddings=False, + rope_scaling=None, + ) + elif "mistral" in args.model_type: + from transformers import MistralConfig + + model_config = MistralConfig( + vocab_size=args.vocab_size, # 32000 + hidden_size=args.hidden_width, # 4096 + intermediate_size=args.intermediate_size, # 14336 + num_hidden_layers=args.num_layers, # 32 + num_attention_heads=args.num_heads, # 32 + num_key_value_heads=args.num_key_value_heads, # 8 + hidden_act="silu", + max_position_embeddings=args.max_context_width, # 4096 * 32 + initializer_range=args.initializer_range, # 0.02 + rms_norm_eps=1e-6, + use_cache=False, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + sliding_window=args.sliding_window, # 4096 + attention_dropout=0.0, + ) + 
elif "mixtral" in args.model_type: + from transformers import MixtralConfig + + model_config = MixtralConfig( + vocab_size=args.vocab_size, # 32000, + hidden_size=args.hidden_width, # 4096, + intermediate_size=args.intermediate_size, # 14336, + num_hidden_layers=args.num_layers, # 32, + num_attention_heads=args.num_heads, # 32, + num_key_value_heads=args.num_key_value_heads, # 8, + hidden_act="silu", + max_position_embeddings=args.max_context_width, # 4096 * 32, + initializer_range=args.initializer_range, # 0.02, + rms_norm_eps=1e-5, + use_cache=False, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=1e6, + sliding_window=args.sliding_window, # None, + attention_dropout=0.0, + num_experts_per_tok=args.num_experts_per_tok, # 2, + num_local_experts=args.num_local_experts, # 8, + output_router_logits=False, + router_aux_loss_coef=0.001, + ) + else: + raise NotImplementedError + return model_config + + +def apply_activation_checkpoint(args, model=None): + """Apply activation checkpoint.""" + from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( + CheckpointImpl, + apply_activation_checkpointing, + checkpoint_wrapper, + ) + + transformer_layer = get_transformer_layer(args.model_type, args.use_smp_implementation) + check_fn_gpt = lambda submodule: isinstance( # pylint: disable=unnecessary-lambda-assignment + submodule, transformer_layer + ) + + if args.fp8==1 and args.use_smp_implementation==1: + import transformer_engine + import torch.sagemaker as tsm + checkpoint_fn = functools.partial( + transformer_engine.pytorch.checkpoint, + distribute_saved_activations=False, + get_cuda_rng_tracker=tsm.state.get_rng_state_tracker, + tp_group=tsm.state.tp_process_group, + ) + checkpoint_impl = CheckpointImpl.NO_REENTRANT + else: + checkpoint_fn = None + checkpoint_impl=CheckpointImpl.REENTRANT + + # flash attn v2 does not work with no_reentrant + # our activation offloading for 2.0 also does not work with no_reentrant + entrant_wrapper = functools.partial( + checkpoint_wrapper, checkpoint_impl=checkpoint_impl, checkpoint_fn=checkpoint_fn + ) + apply_activation_checkpointing( + model, checkpoint_wrapper_fn=entrant_wrapper, check_fn=check_fn_gpt + ) + + +def patch_neox_rope(model): + """Patch neox rope.""" + device = torch.cuda.current_device() + for layer in model.gpt_neox.layers: + layer.attention.rotary_emb.sin_cached = layer.attention.rotary_emb.sin_cached.to(device) + layer.attention.rotary_emb.cos_cached = layer.attention.rotary_emb.cos_cached.to(device) diff --git a/3.test_cases/17.SM-modelparallelv2/scripts/utils.py b/3.test_cases/17.SM-modelparallelv2/scripts/utils.py new file mode 100644 index 00000000..233e118f --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/scripts/utils.py @@ -0,0 +1,76 @@ +"""Utils.""" + +import os +import re +import subprocess +from typing import Any, Dict, Optional + +import numpy + + +def parse_nccl_test_log(log_file: str) -> Optional[numpy.ndarray]: + """Parse NCCL test log file. 
+ + Sample output with 2 nodes: + + # Out of bounds values : 0 OK + # Avg bus bandwidth : 29.8872 + # + # Out of bounds values : 0 OK + # Avg bus bandwidth : 28.9057 + # + """ + try: + with subprocess.Popen( + f"grep 'Avg bus bandwidth' {log_file}", + shell=True, stdout=subprocess.PIPE, encoding="UTF-8", + ) as pipe: + result, _ = pipe.communicate() + except Exception as _: + return None + + bandwidth = [] + for line in str(result).split(os.linesep): + line = line.strip() + if not line: + continue + splits = line.split(":") + if len(splits) == 2 and re.match(r"^\d+\.\d*$", splits[-1].strip()): + bandwidth.append(float(splits[-1].strip())) + + return numpy.array(bandwidth) + + +def get_nccl_test_report(bandwidth: Optional[numpy.ndarray]) -> Optional[Dict[str, Any]]: + """Get the complete NCCL test report.""" + if bandwidth is None: + return None + + bandwidth = bandwidth.reshape((-1,)) + size = len(bandwidth) + if not size: + return None + + data_sorted = numpy.sort(bandwidth) + report = { + "data": bandwidth, + "data_sorted": data_sorted, + "len": size, + # Stats. + "max": numpy.max(bandwidth), + "mean": numpy.mean(bandwidth), + "median": numpy.median(bandwidth), + "min": numpy.min(bandwidth), + "std": numpy.std(bandwidth), + } + + for index in range(2, 6): + if size < index: + break + + report.update({ + f"max{index}": data_sorted[-index], + f"min{index}": data_sorted[index - 1], + }) + + return report diff --git a/3.test_cases/17.SM-modelparallelv2/setup_conda_env.sh b/3.test_cases/17.SM-modelparallelv2/setup_conda_env.sh new file mode 100644 index 00000000..0d636b2a --- /dev/null +++ b/3.test_cases/17.SM-modelparallelv2/setup_conda_env.sh @@ -0,0 +1,89 @@ +# specify which CUDA version you are using +SMP_CUDA_VER=12.1 + +directory="$(pwd)/miniconda3" + +if [ ! -d "$directory" ]; then + echo "Miniconda does not exist.Downloading......" + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + chmod +x Miniconda3-latest-Linux-x86_64.sh + ./Miniconda3-latest-Linux-x86_64.sh -b -f -p ./miniconda3 +else + echo "Miniconda exists...." 
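+  # An existing ./miniconda3 is reused as-is; remove the directory to force a fresh install.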
+fi + +source ./miniconda3/bin/activate + +export ENV_PATH=./miniconda3/envs/smpv2 + +conda create -p ${ENV_PATH} python=3.10 + +conda activate ${ENV_PATH} + + +conda install "aws-ofi-nccl >=1.7.1,<2.0" packaging --override-channels \ + -c https://aws-ml-conda.s3.us-west-2.amazonaws.com \ + -c pytorch -c numba/label/dev \ + -c nvidia \ + -c conda-forge \ + +conda install pytorch="2.2.0=sm_py3.10_cuda12.1_cudnn8.9.5_nccl_pt_2.2_tsm_2.2_cuda12.1_0" packaging --override-channels \ + -c https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/smp-v2/ \ + -c pytorch -c numba/label/dev \ + -c pytorch-nightly -c nvidia -c conda-forge + + +# Install dependencies of the script as below + +python -m pip install --no-cache-dir -U \ + "transformers==4.37.1" \ + "accelerate==0.28.0" \ + "triton==2.2.0" \ + "SentencePiece==0.1.99" \ + "datasets==2.16.1" \ + "expecttest" \ + "parameterized==0.9.0" \ + "protobuf==3.20.3" \ + "pytest-repeat==0.9.1" \ + "pytest==7.4.0" \ + "tensorboard==2.13.0" \ + "tqdm==4.65.0" + + MAX_JOBS=128 pip install flash-attn==2.3.3 --no-build-isolation + + +# Install SMDDP wheel + +RUN SMDDP_WHL="smdistributed_dataparallel-2.2.0-cp310-cp310-linux_x86_64.whl" \ + && wget -q https://smdataparallel.s3.amazonaws.com/binary/pytorch/2.2.0/cu121/2024-03-04/${SMDDP_WHL} \ + && pip install --force ${SMDDP_WHL} \ + && rm ${SMDDP_WHL} + + +if [ $SMP_CUDA_VER == "11.8" ]; then + # cuDNN installation for TransformerEngine installation for cuda11.8 + tar xf cudnn-linux-x86_64-8.9.5.30_cuda11-archive.tar.xz \ + && rm -rf /usr/local/cuda-$SMP_CUDA_VER/include/cudnn* /usr/local/cuda-$SMP_CUDA_VER/lib/cudnn* \ + && cp ./cudnn-linux-x86_64-8.9.5.30_cuda11-archive/include/* /usr/local/cuda-$SMP_CUDA_VER/include/ \ + && cp ./cudnn-linux-x86_64-8.9.5.30_cuda11-archive/lib/* /usr/local/cuda-$SMP_CUDA_VER/lib/ \ + && rm -rf cudnn-linux-x86_64-8.9.5.30_cuda11-archive.tar.xz \ + && rm -rf cudnn-linux-x86_64-8.9.5.30_cuda11-archive/ +else + # cuDNN installation for TransformerEngine installation for cuda12.1 + tar xf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \ + && rm -rf /usr/local/cuda-$SMP_CUDA_VER/include/cudnn* /usr/local/cuda-$SMP_CUDA_VER/lib/cudnn* \ + && cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/include/* /usr/local/cuda-$SMP_CUDA_VER/include/ \ + && cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/lib/* /usr/local/cuda-$SMP_CUDA_VER/lib/ \ + && rm -rf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \ + && rm -rf cudnn-linux-x86_64-8.9.7.29_cuda12-archive/ +fi + +# TransformerEngine installation +export CUDA_HOME=/usr/local/cuda-$SMP_CUDA_VER +export CUDNN_PATH=/usr/local/cuda-$SMP_CUDA_VER/lib +export CUDNN_LIBRARY=/usr/local/cuda-$SMP_CUDA_VER/lib +export CUDNN_INCLUDE_DIR=/usr/local/cuda-$SMP_CUDA_VER/include +export PATH=/usr/local/cuda-$SMP_CUDA_VER/bin:$PATH +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-$SMP_CUDA_VER/lib + +pip install git+https://github.com/NVIDIA/TransformerEngine.git@v1.2.1 From cc9dc66713626e1bb6b0fdb038774c074925a165 Mon Sep 17 00:00:00 2001 From: Arun Lokanatha Date: Thu, 28 Mar 2024 11:03:28 -0700 Subject: [PATCH 2/2] Added comments to conda setup scripts --- 3.test_cases/17.SM-modelparallelv2/setup_conda_env.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/3.test_cases/17.SM-modelparallelv2/setup_conda_env.sh b/3.test_cases/17.SM-modelparallelv2/setup_conda_env.sh index 0d636b2a..4acb6750 100644 --- a/3.test_cases/17.SM-modelparallelv2/setup_conda_env.sh +++ 
b/3.test_cases/17.SM-modelparallelv2/setup_conda_env.sh @@ -20,13 +20,14 @@ conda create -p ${ENV_PATH} python=3.10 conda activate ${ENV_PATH} - +# Install OFI nccl conda install "aws-ofi-nccl >=1.7.1,<2.0" packaging --override-channels \ -c https://aws-ml-conda.s3.us-west-2.amazonaws.com \ -c pytorch -c numba/label/dev \ -c nvidia \ -c conda-forge \ +# Install SMP V2 pytorch. We will install SMP with pytorch 2.2 conda install pytorch="2.2.0=sm_py3.10_cuda12.1_cudnn8.9.5_nccl_pt_2.2_tsm_2.2_cuda12.1_0" packaging --override-channels \ -c https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/smp-v2/ \ -c pytorch -c numba/label/dev \ @@ -52,7 +53,7 @@ python -m pip install --no-cache-dir -U \ MAX_JOBS=128 pip install flash-attn==2.3.3 --no-build-isolation -# Install SMDDP wheel +# Install SMDDP RUN SMDDP_WHL="smdistributed_dataparallel-2.2.0-cp310-cp310-linux_x86_64.whl" \ && wget -q https://smdataparallel.s3.amazonaws.com/binary/pytorch/2.2.0/cu121/2024-03-04/${SMDDP_WHL} \