Skip to content

Commit

Permalink
Upgrade packages + small improvements (#310)
Browse files Browse the repository at this point in the history
Signed-off-by: Igor Gitman <[email protected]>
  • Loading branch information
Kipok authored Jan 9, 2025
1 parent 6c4c59e commit 7bb30c8
Show file tree
Hide file tree
Showing 34 changed files with 1,267 additions and 360 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/gpu_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ jobs:
- name: Cleanup test directory
if: always()
run: |
docker run --rm -v /tmp:/tmp igitman/nemo-skills-nemo:0.4.2 bash -c 'rm -rf /tmp/nemo-skills-tests/llama'
docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.5.0 bash -c 'rm -rf /tmp/nemo-skills-tests/mistral_emb /home/azureuser/.nemo_run/'
gpu-tests-qwen:
runs-on: self-hosted-nemo-gpus-1
Expand Down Expand Up @@ -84,7 +84,7 @@ jobs:
- name: Cleanup test directory
if: always()
run: |
docker run --rm -v /tmp:/tmp igitman/nemo-skills-nemo:0.4.2 bash -c 'rm -rf /tmp/nemo-skills-tests/qwen'
docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.5.0 bash -c 'rm -rf /tmp/nemo-skills-tests/mistral_emb /home/azureuser/.nemo_run/'
gpu-tests-rm:
runs-on: self-hosted-nemo-gpus-1
Expand Down Expand Up @@ -118,4 +118,4 @@ jobs:
- name: Cleanup test directory
if: always()
run: |
docker run --rm -v /tmp:/tmp igitman/nemo-skills-nemo:0.4.2 bash -c 'rm -rf /tmp/nemo-skills-tests/mistral_emb'
docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.5.0 bash -c 'rm -rf /tmp/nemo-skills-tests/mistral_emb /home/azureuser/.nemo_run/'
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
env:
NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
run: |
docker run --rm --name=local-sandbox igitman/nemo-skills-sandbox:0.4.2 &
docker run --rm --name=local-sandbox igitman/nemo-skills-sandbox:0.5.0 &
sleep 120
export NEMO_SKILLS_SANDBOX_HOST=`docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' local-sandbox`
set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
Expand Down
10 changes: 5 additions & 5 deletions cluster_configs/example-local.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
executor: local

containers:
trtllm: igitman/nemo-skills-trtllm:0.4.2
vllm: igitman/nemo-skills-vllm:0.4.2
nemo: igitman/nemo-skills-nemo:0.4.2
sandbox: igitman/nemo-skills-sandbox:0.4.2
nemo-skills: igitman/nemo-skills:0.4.2
trtllm: igitman/nemo-skills-trtllm:0.5.0
vllm: igitman/nemo-skills-vllm:0.5.0
nemo: igitman/nemo-skills-nemo:0.5.0
sandbox: igitman/nemo-skills-sandbox:0.5.0
nemo-skills: igitman/nemo-skills:0.5.0

# add required mounts for models/data here
# the code is mounted automatically inside /nemo_run/code
Expand Down
10 changes: 5 additions & 5 deletions cluster_configs/example-slurm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
executor: slurm

containers:
trtllm: igitman/nemo-skills-trtllm:0.4.2
vllm: igitman/nemo-skills-vllm:0.4.2
nemo: igitman/nemo-skills-nemo:0.4.2
sandbox: igitman/nemo-skills-sandbox:0.4.2
nemo-skills: igitman/nemo-skills:0.4.2
trtllm: igitman/nemo-skills-trtllm:0.5.0
vllm: igitman/nemo-skills-vllm:0.5.0
nemo: igitman/nemo-skills-nemo:0.5.0
sandbox: igitman/nemo-skills-sandbox:0.5.0
nemo-skills: igitman/nemo-skills:0.5.0

job_name_prefix: "nemo_skills:"

Expand Down
108 changes: 60 additions & 48 deletions dockerfiles/Dockerfile.nemo
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#
# To update NeMo-Aligner from a pre-built NeMo-Framework container:
#
# docker buildx build --target=aligner-bump --build-arg=BASE_IMAGE=nvcr.io/nvidia/nemo:24.07 -t aligner:latest .
# docker buildx build --target=aligner-bump -t aligner:latest .
#

# Number of parallel threads for compute heavy build jobs
Expand All @@ -30,13 +30,12 @@ ARG MAX_JOBS=8
# Git refs for dependencies
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG PYTRITON_VERSION=0.5.10
ARG NEMO_TAG=e033481e26e6ae32764d3e2b3f16afed00dc7218 # On: r2.0.0rc1
ARG MLM_TAG=a3fe0c75df82218901fa2c3a7c9e389aa5f53182 # On: core_r0.8.0
ARG ALIGNER_COMMIT=73e6ee16d207e889a36ef3bee27349edad188678 # ON: main
ARG TRTLLM_VERSION=v0.10.0
ARG NEMO_TAG=19668e5320a2e2af0199b6d5e0b841993be3a634 # On: main
ARG MLM_TAG=25059d3bbf68be0751800f3644731df12a88f3f3 # On: main
ARG ALIGNER_COMMIT=35fcfd9df754aff56f71cb3ba3382cc02384361a
ARG TRTLLM_VERSION=v0.13.0
ARG PROTOBUF_VERSION=4.24.4

ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.03-py3
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3

FROM ${BASE_IMAGE} AS aligner-bump
ARG ALIGNER_COMMIT
Expand All @@ -53,13 +52,41 @@ git checkout -f $ALIGNER_COMMIT
# case 2: ALIGNER_COMMIT is a commit, so git-pull is expected to fail
git pull --rebase || true

pip install --no-deps -e .
pip install --no-cache-dir --no-deps -e .
EOF

FROM ${BASE_IMAGE} as final
LABEL "nemo.library"="nemo-aligner"
WORKDIR /opt
# needed in case git complains that it can't detect a valid email, this email is fake but works
RUN git config --global user.email "[email protected]"
# install latest apex
ARG APEX_TAG
RUN pip uninstall -y apex && \
git clone https://github.com/NVIDIA/apex && \
cd apex && \
if [ ! -z $APEX_TAG ]; then \
git fetch origin $APEX_TAG && \
git checkout FETCH_HEAD; \
fi && \
pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./

# Git LFS
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \
apt-get install git-lfs && \
git lfs install && \
apt-get clean

# TRTLLM
ARG TRTLLM_VERSION
RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \
cd TensorRT-LLM && \
git checkout ${TRTLLM_VERSION} && \
. docker/common/install_tensorrt.sh && \
python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks && \
pip install -e .
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/

# install TransformerEngine
ARG MAX_JOBS
ARG TE_TAG
Expand All @@ -73,27 +100,14 @@ RUN pip uninstall -y transformer-engine && \
git submodule init && git submodule update && \
NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .

# install latest apex
ARG APEX_TAG
RUN pip uninstall -y apex && \
git clone https://github.com/NVIDIA/apex && \
cd apex && \
if [ ! -z $APEX_TAG ]; then \
git fetch origin $APEX_TAG && \
git checkout FETCH_HEAD; \
fi && \
pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./

# place any util pkgs here
ARG PYTRITON_VERSION
RUN pip install --upgrade-strategy only-if-needed nvidia-pytriton==$PYTRITON_VERSION
ARG PROTOBUF_VERSION
RUN pip install -U --no-deps protobuf==$PROTOBUF_VERSION
RUN pip install --upgrade-strategy only-if-needed jsonlines

# NeMo (with cherry-picked https://github.com/NVIDIA/NeMo/pull/10785)
# Also patching a bug with hf-dataset cast to int
COPY dockerfiles/nemo.patch /opt/nemo.patch
# NeMo
ARG NEMO_TAG
RUN git clone https://github.com/NVIDIA/NeMo.git && \
cd NeMo && \
Expand All @@ -102,8 +116,6 @@ RUN git clone https://github.com/NVIDIA/NeMo.git && \
git fetch origin $NEMO_TAG && \
git checkout FETCH_HEAD; \
fi && \
git cherry-pick 8949924d7e28dacfa5a573ee1e92c64cf8beb7c9 && \
patch -p1 < /opt/nemo.patch && \
pip uninstall -y nemo_toolkit sacrebleu && \
pip install -e ".[nlp]" && \
cd nemo/collections/nlp/data/language_modeling/megatron && make
Expand All @@ -120,32 +132,32 @@ RUN pip uninstall -y megatron-core && \
fi && \
pip install -e .

# Git LFS
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \
apt-get install git-lfs && \
git lfs install

COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner
RUN cd /opt/NeMo-Aligner && \
pip install --no-deps -e .

# TRTLLM
ARG TRTLLM_VERSION
RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \
cd TensorRT-LLM && \
git checkout ${TRTLLM_VERSION} && \
patch -p1 < ../NeMo-Aligner/setup/trtllm.patch && \
. docker/common/install_tensorrt.sh && \
python3 ./scripts/build_wheel.py --trt_root /usr/local/tensorrt

RUN cd TensorRT-LLM && \
pip install ./build/tensorrt_llm*.whl
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/

# WAR(0.4.0): The pin of NeMo requires a higher nvidia-modelopt version than
# TRT-LLM allows. This installation must follow TRT-LLM and is
# only necessary when NeMo 2.0.0rc1 is installed with TRT-LLM v10.
RUN pip install --upgrade-strategy only-if-needed nvidia-modelopt==0.13.0
RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch

# patching transformers as it has a bug for conversion
RUN pip install --upgrade "transformers>=4.45.2"
# TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs
RUN <<"EOF" bash -exu
cd NeMo
# Ensures we don't cherry-pick "future" origin/main commits
git fetch -a
# 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651
# 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652
# 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
# (superseded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
for pr_and_commit in \
"10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
"10652 60e677423667c029dd05875da72bf0719774f844" \
"10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
; do
pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")
git fetch origin $head_pr_commit:PR-${pr}
# cherry-picks all commits between main and the top of the PR
git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
# Tag cherry-picks to help
git tag cherry-pick-PR-${pr}
done
EOF
3 changes: 0 additions & 3 deletions dockerfiles/Dockerfile.sandbox
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,3 @@ ARG UWSGI_PROCESSES
ENV UWSGI_PROCESSES=$UWSGI_PROCESSES

ENV LISTEN_PORT=6000

# Expose the necessary port for the Flask app
EXPOSE 6000
2 changes: 1 addition & 1 deletion dockerfiles/Dockerfile.tensorrt_llm
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ RUN ["ln", "-sf", "/usr/bin/python3", "/usr/bin/python"]
RUN ["ln", "-sf", "/usr/bin/pip3", "/usr/bin/pip"]

# pinning to the tested version
RUN pip install tensorrt_llm==0.13.0 -U --extra-index-url https://pypi.nvidia.com
RUN pip install tensorrt_llm==0.17.0.dev2024121700 -U --extra-index-url https://pypi.nvidia.com

# installing packages required for our server code
RUN pip install uvicorn fastapi
Expand Down
3 changes: 0 additions & 3 deletions dockerfiles/Dockerfile.vllm
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,3 @@ ENV HF_HOME=/cache/huggingface
ENV TRANSFORMERS_CACHE=/cache/huggingface/transformers
ENV HUGGINGFACE_HUB_CACHE=/cache/huggingface/hub
ENV HF_DATASETS_CACHE=/cache/huggingface/datasets

# Expose port for vLLM API
EXPOSE 5000
13 changes: 9 additions & 4 deletions docs/pipelines/training.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,9 @@ from nemo_skills.pipeline.cli import train, convert, eval
expname = "my-training-job"
cluster = "slurm"
output_dir = f"/workspace/{expname}/checkpoints"
exp = None

train(
exp = train(
ctx=wrap_arguments(""),
cluster=cluster,
expname=expname,
Expand All @@ -128,9 +129,10 @@ train(
num_gpus=8,
num_training_jobs=4,
training_data="/data/sft-data.jsonl",
reuse_code_exp=exp,
)

convert(
exp = convert(
ctx=wrap_arguments(""),
cluster=cluster,
input_model=f"{output_dir}/model-averaged-nemo",
Expand All @@ -142,9 +144,10 @@ convert(
model_type="llama",
num_gpus=8,
hf_model_name="meta-llama/Meta-Llama-3.1-8B",
reuse_code_exp=exp,
)

convert(
exp = convert(
ctx=wrap_arguments(""),
cluster=cluster,
input_model=f"{output_dir}/model-averaged-hf",
Expand All @@ -155,9 +158,10 @@ convert(
convert_to="trtllm",
model_type="llama",
num_gpus=8,
reuse_code_exp=exp,
)

eval(
exp = eval(
ctx=wrap_arguments("++prompt_template=llama3-instruct ++batch_size=512"),
cluster=cluster,
model=f"{output_dir}/model-averaged-trtllm",
Expand All @@ -166,5 +170,6 @@ eval(
benchmarks="gsm8k:0,math:0",
server_gpus=8,
run_after=f"{expname}-to-trtllm",
reuse_code_exp=exp,
)
```
2 changes: 1 addition & 1 deletion nemo_skills/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = '0.4.2'
__version__ = '0.5.0'
Loading

0 comments on commit 7bb30c8

Please sign in to comment.