Skip to content

Commit

Permalink
Merge pull request #277 from aruncs2005/main
Browse files Browse the repository at this point in the history
Fixed conda setup failure for SMP.
  • Loading branch information
aruncs2005 authored Apr 22, 2024
2 parents 6f05309 + 231da90 commit 78d9066
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 88 deletions.
86 changes: 0 additions & 86 deletions 3.test_cases/17.SM-modelparallelv2/conda_env_setup.sh

This file was deleted.

11 changes: 9 additions & 2 deletions 3.test_cases/17.SM-modelparallelv2/setup_conda_env.sh
Original file line number Diff line number Diff line change
@@ -20,15 +20,21 @@ conda create -p ${ENV_PATH} python=3.10

conda activate ${ENV_PATH}


# Install OFI nccl
-conda install "aws-ofi-nccl >=1.7.1,<2.0" packaging --override-channels \
+conda install "aws-ofi-nccl==1.7.4" packaging --override-channels \
-c https://aws-ml-conda.s3.us-west-2.amazonaws.com \
-c pytorch -c numba/label/dev \
-c nvidia \
-c conda-forge \

+conda install -c conda-forge mkl=2023.1.0
+conda install "requests==2.28.2"
+conda install "filelock==3.9.0"
+conda install "sympy==1.12"

# Install SMP V2 pytorch. We will install SMP with pytorch 2.2
-conda install pytorch="2.2.0=sm_py3.10_cuda12.1_cudnn8.9.5_nccl_pt_2.2_tsm_2.2_cuda12.1_0" packaging --override-channels \
+conda install pytorch="2.2.0=sm_py3.10_cuda12.1_cudnn8.9.5_nccl_pt_2.2_tsm_2.3_cuda12.1_0" packaging --override-channels \
-c https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/smp-v2/ \
-c pytorch -c numba/label/dev \
-c pytorch-nightly -c nvidia -c conda-forge
@@ -50,6 +56,7 @@ python -m pip install --no-cache-dir -U \
"tensorboard==2.13.0" \
"tqdm==4.65.0"

+pip install megatron-core==0.5.0

pip uninstall -y ninja && pip install ninja

Expand Down

0 comments on commit 78d9066

Please sign in to comment.