Merge branch 'main' of github.com:NVIDIA/NeMo-Aligner into ashors/topk-logits
NVIDIA · Oct 25, 2024 · bf24e5c · bf24e5c
2 parents 7179eb0 + b1a0140
commit bf24e5c
Show file tree

Hide file tree

Showing 8 changed files with 64 additions and 42 deletions.
diff --git a/.github/workflows/_build_container.yml b/.github/workflows/_build_container.yml
@@ -29,12 +29,6 @@ jobs:
   main:
     runs-on: self-hosted-azure-builder
     steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          path: ${{ github.run_id }}
-          ref: ${{ inputs.ref }}
-
       - name: Clean runner cache
         run: | 
           docker system prune --filter "until=24h" --force
@@ -58,7 +52,7 @@ jobs:
           push: true
           build-args: |
             MAX_JOBS=32
-            ALIGNER_COMMIT=${{ github.event.pull_request.head.sha || github.sha }}
+            ALIGNER_COMMIT=${{ inputs.ref }}
           cache-from: |
             nemoci.azurecr.io/nemo_aligner_container:${{ github.event.pull_request.number || 'buildcache' }}
             nemoci.azurecr.io/nemo_aligner_container:buildcache

diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml
@@ -47,35 +47,44 @@ jobs:
 
         - name: Docker pull image
           run: |
-            #docker pull nemoci.azurecr.io/nemo_aligner_container:${{ github.run_id }}
-            docker pull nemoci.azurecr.io/nemo_aligner_container:${{ github.event.pull_request.number }}
+            docker pull nemoci.azurecr.io/nemo_aligner_container:${{ github.run_id }}
 
+        - name: Start container
+          run: |
+            docker run --rm -d --name nemo_container_${{ github.run_id }} --runtime=nvidia --gpus all --shm-size=64g \
+              --env TRANSFORMERS_OFFLINE=0 \
+              --env HYDRA_FULL_ERROR=1 \
+              --env HF_HOME=/home/TestData/aligner/hf_home \
+              --env ALIGNER_CI_DIR=/home/TestData/aligner \
+              --env ALIGNER_REPO_DIR=/opt/NeMo-Aligner \
+              --volume /mnt/datadrive/TestData/aligner/checkpoints:/home/TestData/aligner/checkpoints:ro \
+              --volume /mnt/datadrive/TestData/aligner/hf_home/hub:/home/TestData/aligner/hf_home/hub:ro \
+              nemoci.azurecr.io/nemo_aligner_container:${{ github.run_id }} \
+              bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
+        # The container above only sleeps (TIMEOUT minutes plus 60 s); the steps below `docker exec` into it.
         - id: main
           name: Run main script
           timeout-minutes: ${{ inputs.TIMEOUT }}
           run: |
+            # Print the host driver for debugging
             nvidia-smi
             mkdir -p ${{ github.run_id }}
             cd ${{ github.run_id }}/
+
             set +e
             (
-              set -e
+            set -e
 
-              cmd=$(cat <<"RUN_TEST_EOF"
+            cmd=$(cat <<"RUN_TEST_EOF"
+            nvidia-smi
             # Sanity check the driver/cuda combo
             cudaCheck
             # In case git commands need to be run inside Aligner
             git config --global --add safe.directory $ALIGNER_REPO_DIR
             ${{ inputs.SCRIPT }}
             RUN_TEST_EOF
             )
-              #docker run --rm --runtime=nvidia --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData:ro nemoci.azurecr.io/nemo_aligner_container:${{ github.run_id }} bash -eux -o pipefail -c "$cmd"
-              docker run --rm --runtime=nvidia --gpus all --shm-size=8g \
-                --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --env HF_HOME=/home/TestData/aligner/hf_home --env ALIGNER_CI_DIR=/home/TestData/aligner --env ALIGNER_REPO_DIR=/opt/NeMo-Aligner \
-                --volume /mnt/datadrive/TestData/aligner/checkpoints:/home/TestData/aligner/checkpoints:ro \
-                --volume /mnt/datadrive/TestData/aligner/hf_home/hub:/home/TestData/aligner/hf_home/hub:ro \
-                nemoci.azurecr.io/nemo_aligner_container:${{ github.event.pull_request.number }} \
-                bash -eux -o pipefail -c "$cmd"
+            docker exec nemo_container_${{ github.run_id }} bash -eux -o pipefail -c "$cmd"
             ) 2> >(tee err.log)
 
             EXIT_CODE=$?
@@ -86,17 +95,18 @@ jobs:
 
         - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
           if: failure() && inputs.IS_OPTIONAL == false
+
         - name: after_script
           if: always() && inputs.AFTER_SCRIPT != ':'
           run: |
             cmd=$(cat <<"RUN_TEST_EOF"
             ${{ inputs.AFTER_SCRIPT }}
             RUN_TEST_EOF
             )
-            #docker run --rm --runtime=nvidia --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_aligner_container:${{ github.run_id }} bash -eux -o pipefail -c "$cmd"
-            docker run --rm --runtime=nvidia --gpus all --shm-size=8g \
-              --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --env HF_HOME=/home/TestData/aligner/hf_home --env ALIGNER_CI_DIR=/home/TestData/aligner \
-              --volume /mnt/datadrive/TestData/aligner/checkpoints:/home/TestData/aligner/checkpoints:ro \
-              --volume /mnt/datadrive/TestData/aligner/hf_home/hub:/home/TestData/aligner/hf_home/hub:ro \
-              nemoci.azurecr.io/nemo_aligner_container:${{ github.event.pull_request.number }} \
-              bash -eux -o pipefail -c "$cmd"
+            docker exec nemo_container_${{ github.run_id }} bash -eux -o pipefail -c "$cmd"
+
+        - name: Container shutdown  # the container is started with --rm, so `rm` after `stop` may find nothing; `|| true` keeps cleanup best-effort
+          if: always()  # run the cleanup even when an earlier step failed
+          run: |
+            docker container stop nemo_container_${{ github.run_id }} || true
+            docker container rm nemo_container_${{ github.run_id }} || true
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
@@ -59,18 +59,17 @@ jobs:
   Unit_Tests:
     needs: [build-container, pre-flight]
     uses: ./.github/workflows/_run_test.yml
-    if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'unit') # || needs.pre-flight.outputs.all == 'true' ## TODO: comment back in
+    if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'unit') || needs.pre-flight.outputs.all == 'true'
     with:
-      IS_OPTIONAL: true
       RUNNER: self-hosted-azure
-      TIMEOUT: 20
+      TIMEOUT: 10
       SCRIPT: |
         nvidia-smi
         cd ${ALIGNER_REPO_DIR}
-        torchrun --nproc_per_node 2 -m pytest . -rA -s -x
+        bash tests/run_unit.sh
 
   Functional_Tests:
-    name: ${{ matrix.TEST_NAME }}
+    name: ${{ matrix.test_case }}
     needs: [build-container, pre-flight]
     uses: ./.github/workflows/_run_test.yml
     if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'functional') || needs.pre-flight.outputs.all == 'true'
@@ -81,12 +80,12 @@ jobs:
           - dpo-llama3
           - kd-llama3
     with:
-      IS_OPTIONAL: true
       RUNNER: self-hosted-azure
       # Fairly aggressive timeout that all functional tests should try to adhere to
       TIMEOUT: 10
       SCRIPT: |
-        export PYTHONPATH=/opt/NeMo-Aligner:${PYTHONPATH:-}
-        git config --global --add safe.directory /opt/NeMo-Aligner
+        export PYTHONPATH=${ALIGNER_REPO_DIR}:${PYTHONPATH:-}
+        nvidia-smi
+        git config --global --add safe.directory ${ALIGNER_REPO_DIR}
         cd ${ALIGNER_REPO_DIR}
         bash tests/functional/test_cases/${{ matrix.test_case }}.sh
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -17,6 +17,12 @@ default_language_version:
 
 ci:
   autofix_prs: true
+  autofix_commit_msg: |
+        [pre-commit.ci] auto fixes from pre-commit.com hooks
+
+        for more information, see https://pre-commit.ci
+
+        Signed-off-by: NeMo-Aligner CI <[email protected]>
   autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
   autoupdate_schedule: quarterly
 

diff --git a/Dockerfile b/Dockerfile
@@ -30,8 +30,7 @@ if [[ ! -d NeMo-Aligner ]]; then
     git clone https://github.com/NVIDIA/NeMo-Aligner.git
 fi
 cd NeMo-Aligner
-git fetch -a
-# -f since git status may not be clean
+git fetch origin '+refs/pull/*/merge:refs/remotes/pull/*/merge'
 git checkout -f $ALIGNER_COMMIT
 # case 1: ALIGNER_COMMIT is a local branch so we have to apply remote changes to it
 # case 2: ALIGNER_COMMIT is a commit, so git-pull is expected to fail

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -18,13 +18,18 @@
 def pytest_addoption(parser):
     """
     Additional command-line arguments passed to pytest.
-        --cpu: use CPU during testing (DEFAULT: GPU)
     """
     parser.addoption(
         "--cpu", action="store_true", help="pass that argument to use CPU during testing (DEFAULT: False = GPU)"
     )
 
 
+def pytest_configure(config):  # registers the custom `run_only_on(device)` marker in the "markers" ini list
+    config.addinivalue_line(
+        "markers", "run_only_on(device): runs the test only on a given device [CPU | GPU]",
+    )
+
+
 @pytest.fixture
 def device(request):
     """ Simple fixture returning string denoting the device [CPU | GPU] """
@@ -39,9 +44,3 @@ def run_only_on_device_fixture(request, device):
     if request.node.get_closest_marker("run_only_on"):
         if request.node.get_closest_marker("run_only_on").args[0] != device:
             pytest.skip("skipped on this device: {}".format(device))
-
-
-def pytest_configure(config):
-    config.addinivalue_line(
-        "markers", "run_only_on(device): runs the test only on a given device [CPU | GPU]",
-    )
diff --git a/tests/run_unit.sh b/tests/run_unit.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Runs the unit tests under torchrun on GPUs 0 and 1; extra arguments are forwarded to pytest.
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cd "$SCRIPT_DIR" || exit 1
+# With this query nvidia-smi prints one line per GPU, so the line count is the GPU count.
+NUM_GPUS_AVAILABLE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+# The torchrun call below starts 2 processes on devices 0 and 1, hence the minimum of 2.
+if [[ $NUM_GPUS_AVAILABLE -lt 2 ]]; then
+    echo "[ERROR]: Unit tests require at least 2 gpus"
+    exit 1
+fi
+# Put the parent of this script's directory on the Python import path.
+export PYTHONPATH="$(realpath ..):${PYTHONPATH:-}"
+CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node 2 -m pytest .. -rA -s -x "$@"
diff --git a/tests/test_distributed.py b/tests/test_distributed.py
@@ -109,6 +109,7 @@ def loss_func(
     return loss
 
 
+@pytest.mark.skip(reason="Tests currently hang and causes long delays")
 class TestDistributedFunctions:
     def _init_distributed(self, local_rank, main_address, main_port, nprocs):
         if torch.distributed.is_available() and not torch.distributed.is_initialized():