From 71e25cd2e0f8a371397a0f07b980187aa00810f6 Mon Sep 17 00:00:00 2001
From: trias702
Date: Tue, 5 Dec 2023 13:34:37 -0800
Subject: [PATCH] add stable dockerfile

Signed-off-by: Daniel Egert
Signed-off-by: Gerald Shen
---
Reviewer notes (text in this section is not part of the commit message):
- The apex step now runs `pip install -v ... ./`; the duplicated `install`
  token made pip treat "install" as an additional requirement name.
- Tag guards use `[ -n "$TAG" ]` with quoted expansions instead of the
  unquoted `[ ! -z $TAG ]`.
- The Dockerfile blob id on the `index` line below predates these edits.

 CHANGELOG.md |  1 +
 Dockerfile   | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 README.md    |  5 +++-
 3 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 Dockerfile

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9a6e2cf42..358b12598 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 ## [Next Version]
 
 ### New features and optimizations
+- Added public-facing official Dockerfile for NeMo-Aligner
 
 ### Breaking changes
 
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 000000000..010b960ab
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,76 @@
+# CUDA 12.2
+FROM nvcr.io/nvidia/pytorch:23.10-py3
+
+### config tags (override with --build-arg; an empty tag skips the checkout and builds the default branch)
+ARG APEX_TAG=master
+ARG TE_TAG=release_v1.1
+ARG MLM_TAG=core_r0.4.0
+ARG NEMO_TAG=r1.22.0
+ARG PYTRITON_VERSION=0.4.1
+ARG PROTOBUF_VERSION=4.24.4
+
+# if you get errors building TE or Apex, decrease this to 4
+ARG MAX_JOBS=8
+
+# needed in case git complains that it can't detect a valid email, this email is fake but works
+RUN git config --global user.email "worker@nvidia.com"
+
+WORKDIR /opt
+
+# install TransformerEngine at TE_TAG, replacing the copy shipped in the base image
+RUN pip uninstall -y transformer-engine && \
+    git clone https://github.com/NVIDIA/TransformerEngine.git && \
+    cd TransformerEngine && \
+    if [ -n "$TE_TAG" ]; then \
+        git fetch origin "$TE_TAG" && \
+        git checkout FETCH_HEAD; \
+    fi && \
+    git submodule init && git submodule update && \
+    NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .
+
+# install apex at APEX_TAG (master by default), building the listed C++/CUDA extensions
+RUN pip uninstall -y apex && \
+    git clone https://github.com/NVIDIA/apex && \
+    cd apex && \
+    if [ -n "$APEX_TAG" ]; then \
+        git fetch origin "$APEX_TAG" && \
+        git checkout FETCH_HEAD; \
+    fi && \
+    pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./
+
+# place any util pkgs here
+RUN pip install --upgrade-strategy only-if-needed "nvidia-pytriton==$PYTRITON_VERSION"
+RUN pip install -U --no-deps "protobuf==$PROTOBUF_VERSION"
+RUN pip install --upgrade-strategy only-if-needed jsonlines
+
+# NeMo at NEMO_TAG, plus two cherry-picked commits and a one-line sed patch to nlp_overrides.py
+RUN git clone https://github.com/NVIDIA/NeMo.git && \
+    cd NeMo && \
+    git pull && \
+    if [ -n "$NEMO_TAG" ]; then \
+        git fetch origin "$NEMO_TAG" && \
+        git checkout FETCH_HEAD; \
+    fi && \
+    pip uninstall -y nemo_toolkit sacrebleu && \
+    git cherry-pick --no-commit -X theirs fa8d416793d850f4ce56bea65e1fe28cc0d092c0 a7f0bc1903493888c31436efc2452ff721fa5a67 && \
+    sed -i 's/shutil.rmtree(ckpt_to_dir(filepath))/shutil.rmtree(ckpt_to_dir(filepath), ignore_errors=True)/g' nemo/collections/nlp/parts/nlp_overrides.py && \
+    rm -rf .git && pip install -e ".[nlp]" && \
+    cd nemo/collections/nlp/data/language_modeling/megatron && make
+
+# Megatron-LM (MLM) at MLM_TAG
+RUN pip uninstall -y megatron-core && \
+    git clone https://github.com/NVIDIA/Megatron-LM.git && \
+    cd Megatron-LM && \
+    git pull && \
+    if [ -n "$MLM_TAG" ]; then \
+        git fetch origin "$MLM_TAG" && \
+        git checkout FETCH_HEAD; \
+    fi && \
+    pip install -e .
+
+WORKDIR /opt
+
+# install the latest NeMo-Aligner
+RUN pip install --no-deps git+https://github.com/NVIDIA/NeMo-Aligner.git@main
+
+WORKDIR /workspace
diff --git a/README.md b/README.md
index cfe7269e2..80edbb409 100644
--- a/README.md
+++ b/README.md
@@ -44,7 +44,10 @@ pip install .
 
 ### Docker Containers
 
-To build your own, refer to the [NeMo Dockerfile](https://github.com/NVIDIA/NeMo/blob/main/Dockerfile) and add `RUN pip install nemo-aligner` at the end.
+We provide an official NeMo-Aligner Dockerfile which is based on stable, tested versions of NeMo, Megatron-LM, and TransformerEngine. The goal of this Dockerfile
+is stability, so it may not track the very latest versions of those 3 packages. You can access our Dockerfile [here](https://github.com/NVIDIA/NeMo-Aligner/blob/main/Dockerfile).
+
+Alternatively, you can build the [NeMo Dockerfile](https://github.com/NVIDIA/NeMo/blob/main/Dockerfile) and add `RUN pip install nemo-aligner` at the end.
 
 ## Future work
 - Add Rejection Sampling support