Commit 0bfe6f0: merged with main

i-vainn committed Jan 14, 2025
2 parents 3dd9e54 + 7bb30c8
Showing 58 changed files with 1,634 additions and 479 deletions.
14 changes: 13 additions & 1 deletion .github/workflows/gpu_tests.yml
@@ -46,6 +46,10 @@ jobs:
nvidia-smi
set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
./tests/gpu-tests/run_llama.sh
+      - name: Cleanup test directory
+        if: always()
+        run: |
+          docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.5.0 bash -c 'rm -rf /tmp/nemo-skills-tests/mistral_emb /home/azureuser/.nemo_run/'
gpu-tests-qwen:
runs-on: self-hosted-nemo-gpus-1
@@ -77,7 +81,11 @@ jobs:
nvidia-smi
set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
./tests/gpu-tests/run_qwen.sh
+      - name: Cleanup test directory
+        if: always()
+        run: |
+          docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.5.0 bash -c 'rm -rf /tmp/nemo-skills-tests/mistral_emb /home/azureuser/.nemo_run/'
gpu-tests-rm:
runs-on: self-hosted-nemo-gpus-1
if: ${{ github.event.label.name == 'run GPU tests' }}
@@ -107,3 +115,7 @@ jobs:
nvidia-smi
set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
./tests/gpu-tests/run_rm.sh
+      - name: Cleanup test directory
+        if: always()
+        run: |
+          docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.5.0 bash -c 'rm -rf /tmp/nemo-skills-tests/mistral_emb /home/azureuser/.nemo_run/'
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -33,7 +33,7 @@ jobs:
env:
NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
run: |
-          docker run --rm --name=local-sandbox igitman/nemo-skills-sandbox:0.4.2 &
+          docker run --rm --name=local-sandbox igitman/nemo-skills-sandbox:0.5.0 &
sleep 120
export NEMO_SKILLS_SANDBOX_HOST=`docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' local-sandbox`
set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
5 changes: 3 additions & 2 deletions README.md
@@ -9,9 +9,10 @@ Here are some of the things we support.
 and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) servers and easily convert checkpoints from one format to another.
 - [Model evaluation](https://nvidia.github.io/NeMo-Skills/pipelines/evaluation): Evaluate your models on many popular benchmarks
   - Math problem solving: gsm8k, math, amc23, aime24, omni-math (and many more)
+  - Formal proofs in Lean: minif2f, proofnet
   - Coding skills: human-eval, mbpp
-  - Chat/instruction following: ifeval, arena-hard
-  - General knowledge: mmlu (generative)
+  - Chat/instruction following: ifeval, arena-hard, mt-bench
+  - General knowledge: mmlu (generative), mmlu-pro
 - [Model training](https://nvidia.github.io/NeMo-Skills/pipelines/training): Train models at speed-of-light using [NeMo-Aligner](https://github.com/NVIDIA/NeMo-Aligner/).
 
 You can find the full documentation [here](https://nvidia.github.io/NeMo-Skills/).
10 changes: 5 additions & 5 deletions cluster_configs/example-local.yaml
@@ -15,11 +15,11 @@
executor: local

containers:
-  trtllm: igitman/nemo-skills-trtllm:0.4.2
-  vllm: igitman/nemo-skills-vllm:0.4.2
-  nemo: igitman/nemo-skills-nemo:0.4.2
-  sandbox: igitman/nemo-skills-sandbox:0.4.2
-  nemo-skills: igitman/nemo-skills:0.4.2
+  trtllm: igitman/nemo-skills-trtllm:0.5.0
+  vllm: igitman/nemo-skills-vllm:0.5.0
+  nemo: igitman/nemo-skills-nemo:0.5.0
+  sandbox: igitman/nemo-skills-sandbox:0.5.0
+  nemo-skills: igitman/nemo-skills:0.5.0

# add required mounts for models/data here
# the code is mounted automatically inside /nemo_run/code
10 changes: 5 additions & 5 deletions cluster_configs/example-slurm.yaml
@@ -15,11 +15,11 @@
executor: slurm

containers:
-  trtllm: igitman/nemo-skills-trtllm:0.4.2
-  vllm: igitman/nemo-skills-vllm:0.4.2
-  nemo: igitman/nemo-skills-nemo:0.4.2
-  sandbox: igitman/nemo-skills-sandbox:0.4.2
-  nemo-skills: igitman/nemo-skills:0.4.2
+  trtllm: igitman/nemo-skills-trtllm:0.5.0
+  vllm: igitman/nemo-skills-vllm:0.5.0
+  nemo: igitman/nemo-skills-nemo:0.5.0
+  sandbox: igitman/nemo-skills-sandbox:0.5.0
+  nemo-skills: igitman/nemo-skills:0.5.0

job_name_prefix: "nemo_skills:"

108 changes: 60 additions & 48 deletions dockerfiles/Dockerfile.nemo
@@ -21,7 +21,7 @@
#
# To update NeMo-Aligner from a pre-built NeMo-Framework container:
#
-#   docker buildx build --target=aligner-bump --build-arg=BASE_IMAGE=nvcr.io/nvidia/nemo:24.07 -t aligner:latest .
+#   docker buildx build --target=aligner-bump -t aligner:latest .
#

# Number of parallel threads for compute heavy build jobs
@@ -30,13 +30,12 @@ ARG MAX_JOBS=8
# Git refs for dependencies
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG PYTRITON_VERSION=0.5.10
-ARG NEMO_TAG=e033481e26e6ae32764d3e2b3f16afed00dc7218 # On: r2.0.0rc1
-ARG MLM_TAG=a3fe0c75df82218901fa2c3a7c9e389aa5f53182 # On: core_r0.8.0
-ARG ALIGNER_COMMIT=73e6ee16d207e889a36ef3bee27349edad188678 # ON: main
-ARG TRTLLM_VERSION=v0.10.0
+ARG NEMO_TAG=19668e5320a2e2af0199b6d5e0b841993be3a634 # On: main
+ARG MLM_TAG=25059d3bbf68be0751800f3644731df12a88f3f3 # On: main
+ARG ALIGNER_COMMIT=35fcfd9df754aff56f71cb3ba3382cc02384361a
+ARG TRTLLM_VERSION=v0.13.0
ARG PROTOBUF_VERSION=4.24.4

-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.03-py3
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3

FROM ${BASE_IMAGE} AS aligner-bump
ARG ALIGNER_COMMIT
@@ -53,13 +52,41 @@ git checkout -f $ALIGNER_COMMIT
# case 2: ALIGNER_COMMIT is a commit, so git-pull is expected to fail
git pull --rebase || true

-pip install --no-deps -e .
+pip install --no-cache-dir --no-deps -e .
EOF

FROM ${BASE_IMAGE} as final
LABEL "nemo.library"="nemo-aligner"
WORKDIR /opt
# needed in case git complains that it can't detect a valid email, this email is fake but works
RUN git config --global user.email "[email protected]"
+# install latest apex
+ARG APEX_TAG
+RUN pip uninstall -y apex && \
+    git clone https://github.com/NVIDIA/apex && \
+    cd apex && \
+    if [ ! -z $APEX_TAG ]; then \
+        git fetch origin $APEX_TAG && \
+        git checkout FETCH_HEAD; \
+    fi && \
+    pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./
+
+# Git LFS
+RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \
+    apt-get install git-lfs && \
+    git lfs install && \
+    apt-get clean
+
+# TRTLLM
+ARG TRTLLM_VERSION
+RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \
+    cd TensorRT-LLM && \
+    git checkout ${TRTLLM_VERSION} && \
+    . docker/common/install_tensorrt.sh && \
+    python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks && \
+    pip install -e .
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/

# install TransformerEngine
ARG MAX_JOBS
ARG TE_TAG
@@ -73,27 +100,14 @@ RUN pip uninstall -y transformer-engine && \
git submodule init && git submodule update && \
NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .

-# install latest apex
-ARG APEX_TAG
-RUN pip uninstall -y apex && \
-    git clone https://github.com/NVIDIA/apex && \
-    cd apex && \
-    if [ ! -z $APEX_TAG ]; then \
-        git fetch origin $APEX_TAG && \
-        git checkout FETCH_HEAD; \
-    fi && \
-    pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./

# place any util pkgs here
ARG PYTRITON_VERSION
RUN pip install --upgrade-strategy only-if-needed nvidia-pytriton==$PYTRITON_VERSION
ARG PROTOBUF_VERSION
RUN pip install -U --no-deps protobuf==$PROTOBUF_VERSION
RUN pip install --upgrade-strategy only-if-needed jsonlines

-# NeMo (with cherry-picked https://github.com/NVIDIA/NeMo/pull/10785)
-# Also patching a bug with hf-dataset cast to int
-COPY dockerfiles/nemo.patch /opt/nemo.patch
+# NeMo
ARG NEMO_TAG
RUN git clone https://github.com/NVIDIA/NeMo.git && \
cd NeMo && \
@@ -102,8 +116,6 @@ RUN git clone https://github.com/NVIDIA/NeMo.git && \
git fetch origin $NEMO_TAG && \
git checkout FETCH_HEAD; \
fi && \
-    git cherry-pick 8949924d7e28dacfa5a573ee1e92c64cf8beb7c9 && \
-    patch -p1 < /opt/nemo.patch && \
pip uninstall -y nemo_toolkit sacrebleu && \
pip install -e ".[nlp]" && \
cd nemo/collections/nlp/data/language_modeling/megatron && make
@@ -120,32 +132,32 @@ RUN pip uninstall -y megatron-core && \
fi && \
pip install -e .

-# Git LFS
-RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \
-    apt-get install git-lfs && \
-    git lfs install

COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner
RUN cd /opt/NeMo-Aligner && \
pip install --no-deps -e .

-# TRTLLM
-ARG TRTLLM_VERSION
-RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \
-    cd TensorRT-LLM && \
-    git checkout ${TRTLLM_VERSION} && \
-    patch -p1 < ../NeMo-Aligner/setup/trtllm.patch && \
-    . docker/common/install_tensorrt.sh && \
-    python3 ./scripts/build_wheel.py --trt_root /usr/local/tensorrt
-
-RUN cd TensorRT-LLM && \
-    pip install ./build/tensorrt_llm*.whl
-ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/

-# WAR(0.4.0): The pin of NeMo requires a higher nvidia-modelopt version than
-#             TRT-LLM allows. This installation must follow TRT-LLM and is
-#             only necessary when NeMo 2.0.0rc1 is installed with TRT-LLM v10.
-RUN pip install --upgrade-strategy only-if-needed nvidia-modelopt==0.13.0
+RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch

+# patching transformers as it has a bug for conversion
+RUN pip install --upgrade transformers>=4.45.2
+# TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs
+RUN <<"EOF" bash -exu
+cd NeMo
+# Ensures we don't cherry-pick "future" origin/main commits
+git fetch -a
+# 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651
+# 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652
+# 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
+# (superseded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
+for pr_and_commit in \
+    "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
+    "10652 60e677423667c029dd05875da72bf0719774f844" \
+    "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
+; do
+    pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
+    head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")
+    git fetch origin $head_pr_commit:PR-${pr}
+    # cherry-picks all commits between main and the top of the PR
+    git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
+    # Tag cherry-picks to help
+    git tag cherry-pick-PR-${pr}
+done
+EOF
3 changes: 0 additions & 3 deletions dockerfiles/Dockerfile.sandbox
@@ -58,6 +58,3 @@ ARG UWSGI_PROCESSES
ENV UWSGI_PROCESSES=$UWSGI_PROCESSES

ENV LISTEN_PORT=6000

-# Expose the necessary port for the Flask app
-EXPOSE 6000
2 changes: 1 addition & 1 deletion dockerfiles/Dockerfile.tensorrt_llm
@@ -23,7 +23,7 @@ RUN ["ln", "-sf", "/usr/bin/python3", "/usr/bin/python"]
RUN ["ln", "-sf", "/usr/bin/pip3", "/usr/bin/pip"]

# pinning to the tested version
-RUN pip install tensorrt_llm==0.13.0 -U --extra-index-url https://pypi.nvidia.com
+RUN pip install tensorrt_llm==0.17.0.dev2024121700 -U --extra-index-url https://pypi.nvidia.com

# installing packages required for our server code
RUN pip install uvicorn fastapi
3 changes: 0 additions & 3 deletions dockerfiles/Dockerfile.vllm
@@ -55,6 +55,3 @@ ENV HF_HOME=/cache/huggingface
ENV TRANSFORMERS_CACHE=/cache/huggingface/transformers
ENV HUGGINGFACE_HUB_CACHE=/cache/huggingface/hub
ENV HF_DATASETS_CACHE=/cache/huggingface/datasets

-# Expose port for vLLM API
-EXPOSE 5000
4 changes: 2 additions & 2 deletions docs/basics/inference.md
@@ -103,11 +103,11 @@ Click on :material-plus-circle: symbols in the snippet below to learn more details
or [create your own prompts](prompt-format.md)


-2. This should print
+3. This should print

```python-console
>>> print(prompts[0])
-    [{'role': 'system', 'content': ''}, {'role': 'user', 'content': "What's 2 + 2?"}]
+    [{'role': 'user', 'content': "What's 2 + 2?"}]
```

If you don't want to use our prompt class, just create this list yourself
70 changes: 60 additions & 10 deletions docs/basics/prompt-format.md
@@ -1,11 +1,5 @@
# Prompt utilities

-!!! note
-
-    While some of the sections below mention multi-turn prompts, we don't actually
-    support them at the moment. This is mainly because we don't have a real use-case for multi-turn
-    conversations in our work. Please open an issue if you need to use multi-turn prompts.

Our prompts are configured via two input yaml files: prompt template and prompt config.

## Prompt template
@@ -147,14 +141,70 @@ which outputs
```python-console
[
{
-        'role': 'system',
-        'content': ''
+        'role': 'user',
+        'content': "Solve the following math problem. Make sure to put the answer (and only answer) inside \\boxed{}.\n\nWhat's 2 + 2?"
}
]
```

+You can also have a look at the [tests](https://github.com/NVIDIA/NeMo-Skills/tree/main/tests/test_prompts.py) to see more examples of using our prompt API.


+## Multi-turn prompts
+
+If your data is naturally multi-turn (e.g. user-assistant conversations), you can use a special parameter `multi_turn_key` to format
+the whole conversation together. It can be of any length, as long as each entry except the last has a special `assistant` key. The prompt config
+will be applied to each list entry separately. Here is an example
+
+```python
+from nemo_skills.prompt.utils import get_prompt
+prompt = get_prompt('generic/default')
+data = {'turns': [{'question': "What's 2 + 2?", 'assistant': "easy, that's 5!"}, {'question': 'Can you double check?'}]}
+print(prompt.fill(data, multi_turn_key='turns'))
+```
+
+which outputs
+
+```python-console
+[
+    {
+        'role': 'user',
+        'content': "What's 2 + 2?"
+    },
+    {
+        'role': 'assistant',
+        'content': "easy, that's 5!"
+    },
+    {
+        'role': 'user',
-        'content': "Solve the following math problem. Make sure to put the answer (and only answer) inside \\boxed{}.\n\nWhat's 2 + 2?"
+        'content': 'Can you double check?'
+    }
+]
+```
-You can also have a look at the [tests](https://github.com/NVIDIA/NeMo-Skills/tests/test_prompts.py) to see more examples of using our prompt API.
+or if using a template

+```python
+from nemo_skills.prompt.utils import get_prompt
+prompt = get_prompt('generic/default', 'llama3-instruct')
+data = {'turns': [{'question': "What's 2 + 2?", 'assistant': "easy, that's 5!"}, {'question': 'Can you double check?'}]}
+print(prompt.fill(data, multi_turn_key='turns'))
+```
+
+which outputs
+
+```python-console
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+<|eot_id|><|start_header_id|>user<|end_header_id|>
+What's 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+easy, that's 5!<|eot_id|><|start_header_id|>user<|end_header_id|>
+Can you double check?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+```
+
+For an example of how to use it in a real data file, see the [mt-bench dataset](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/dataset/mt-bench).
5 changes: 3 additions & 2 deletions docs/index.md
@@ -13,9 +13,10 @@ Here are some of the things we support.
 and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) servers and easily convert checkpoints from one format to another.
 - [Model evaluation](pipelines/evaluation.md): Evaluate your models on many popular benchmarks
   - Math problem solving: gsm8k, math, amc23, aime24, omni-math (and many more)
+  - Formal proofs in Lean: minif2f, proofnet
   - Coding skills: human-eval, mbpp
-  - Chat/instruction following: ifeval, arena-hard
-  - General knowledge: mmlu (generative)
+  - Chat/instruction following: ifeval, arena-hard, mt-bench
+  - General knowledge: mmlu (generative), mmlu-pro
 - [Model training](pipelines/training.md): Train models at speed-of-light using [NeMo-Aligner](https://github.com/NVIDIA/NeMo-Aligner/).
 
 To get started, follow the [prerequisites](basics/prerequisites.md) and then run `ns --help` to see all available