Skip to content

Commit

Permalink
Merge branch 'main' of github.com:NVIDIA/NeMo-Aligner into ashors/top…
Browse files Browse the repository at this point in the history
…k-logits
  • Loading branch information
ashors1 committed Oct 25, 2024
2 parents 7179eb0 + b1a0140 commit bf24e5c
Show file tree
Hide file tree
Showing 8 changed files with 64 additions and 42 deletions.
8 changes: 1 addition & 7 deletions .github/workflows/_build_container.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,6 @@ jobs:
main:
runs-on: self-hosted-azure-builder
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
path: ${{ github.run_id }}
ref: ${{ inputs.ref }}

- name: Clean runner cache
run: |
docker system prune --filter "until=24h" --force
Expand All @@ -58,7 +52,7 @@ jobs:
push: true
build-args: |
MAX_JOBS=32
ALIGNER_COMMIT=${{ github.event.pull_request.head.sha || github.sha }}
ALIGNER_COMMIT=${{ inputs.ref }}
cache-from: |
nemoci.azurecr.io/nemo_aligner_container:${{ github.event.pull_request.number || 'buildcache' }}
nemoci.azurecr.io/nemo_aligner_container:buildcache
Expand Down
46 changes: 28 additions & 18 deletions .github/workflows/_run_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,35 +47,44 @@ jobs:
- name: Docker pull image
run: |
#docker pull nemoci.azurecr.io/nemo_aligner_container:${{ github.run_id }}
docker pull nemoci.azurecr.io/nemo_aligner_container:${{ github.event.pull_request.number }}
docker pull nemoci.azurecr.io/nemo_aligner_container:${{ github.run_id }}
- name: Start container
run: |
docker run --rm -d --name nemo_container_${{ github.run_id }} --runtime=nvidia --gpus all --shm-size=64g \
--env TRANSFORMERS_OFFLINE=0 \
--env HYDRA_FULL_ERROR=1 \
--env HF_HOME=/home/TestData/aligner/hf_home \
--env ALIGNER_CI_DIR=/home/TestData/aligner \
--env ALIGNER_REPO_DIR=/opt/NeMo-Aligner \
--volume /mnt/datadrive/TestData/aligner/checkpoints:/home/TestData/aligner/checkpoints:ro \
--volume /mnt/datadrive/TestData/aligner/hf_home/hub:/home/TestData/aligner/hf_home/hub:ro \
nemoci.azurecr.io/nemo_aligner_container:${{ github.run_id }} \
bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
- id: main
name: Run main script
timeout-minutes: ${{ inputs.TIMEOUT }}
run: |
# Print the host driver for debugging
nvidia-smi
mkdir -p ${{ github.run_id }}
cd ${{ github.run_id }}/
set +e
(
set -e
set -e
cmd=$(cat <<"RUN_TEST_EOF"
cmd=$(cat <<"RUN_TEST_EOF"
nvidia-smi
# Sanity check the driver/cuda combo
cudaCheck
# In case git commands need to be run inside Aligner
git config --global --add safe.directory $ALIGNER_REPO_DIR
${{ inputs.SCRIPT }}
RUN_TEST_EOF
)
#docker run --rm --runtime=nvidia --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData:ro nemoci.azurecr.io/nemo_aligner_container:${{ github.run_id }} bash -eux -o pipefail -c "$cmd"
docker run --rm --runtime=nvidia --gpus all --shm-size=8g \
--env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --env HF_HOME=/home/TestData/aligner/hf_home --env ALIGNER_CI_DIR=/home/TestData/aligner --env ALIGNER_REPO_DIR=/opt/NeMo-Aligner \
--volume /mnt/datadrive/TestData/aligner/checkpoints:/home/TestData/aligner/checkpoints:ro \
--volume /mnt/datadrive/TestData/aligner/hf_home/hub:/home/TestData/aligner/hf_home/hub:ro \
nemoci.azurecr.io/nemo_aligner_container:${{ github.event.pull_request.number }} \
bash -eux -o pipefail -c "$cmd"
docker exec nemo_container_${{ github.run_id }} bash -eux -o pipefail -c "$cmd"
) 2> >(tee err.log)
EXIT_CODE=$?
Expand All @@ -86,17 +95,18 @@ jobs:
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: failure() && inputs.IS_OPTIONAL == false

- name: after_script
if: always() && inputs.AFTER_SCRIPT != ':'
run: |
cmd=$(cat <<"RUN_TEST_EOF"
${{ inputs.AFTER_SCRIPT }}
RUN_TEST_EOF
)
#docker run --rm --runtime=nvidia --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_aligner_container:${{ github.run_id }} bash -eux -o pipefail -c "$cmd"
docker run --rm --runtime=nvidia --gpus all --shm-size=8g \
--env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --env HF_HOME=/home/TestData/aligner/hf_home --env ALIGNER_CI_DIR=/home/TestData/aligner \
--volume /mnt/datadrive/TestData/aligner/checkpoints:/home/TestData/aligner/checkpoints:ro \
--volume /mnt/datadrive/TestData/aligner/hf_home/hub:/home/TestData/aligner/hf_home/hub:ro \
nemoci.azurecr.io/nemo_aligner_container:${{ github.event.pull_request.number }} \
bash -eux -o pipefail -c "$cmd"
docker exec nemo_container_${{ github.run_id }} bash -eux -o pipefail -c "$cmd"
- name: Container shutdown
if: always()
run: |
docker container stop nemo_container_${{ github.run_id }} || true
docker container rm nemo_container_${{ github.run_id }} || true
15 changes: 7 additions & 8 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,18 +59,17 @@ jobs:
Unit_Tests:
needs: [build-container, pre-flight]
uses: ./.github/workflows/_run_test.yml
if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'unit') # || needs.pre-flight.outputs.all == 'true' ## TODO: comment back in
if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'unit') || needs.pre-flight.outputs.all == 'true'
with:
IS_OPTIONAL: true
RUNNER: self-hosted-azure
TIMEOUT: 20
TIMEOUT: 10
SCRIPT: |
nvidia-smi
cd ${ALIGNER_REPO_DIR}
torchrun --nproc_per_node 2 -m pytest . -rA -s -x
bash tests/run_unit.sh
Functional_Tests:
name: ${{ matrix.TEST_NAME }}
name: ${{ matrix.test_case }}
needs: [build-container, pre-flight]
uses: ./.github/workflows/_run_test.yml
if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'functional') || needs.pre-flight.outputs.all == 'true'
Expand All @@ -81,12 +80,12 @@ jobs:
- dpo-llama3
- kd-llama3
with:
IS_OPTIONAL: true
RUNNER: self-hosted-azure
# Fairly aggressive timeout that all functional tests should try to adhere to
TIMEOUT: 10
SCRIPT: |
export PYTHONPATH=/opt/NeMo-Aligner:${PYTHONPATH:-}
git config --global --add safe.directory /opt/NeMo-Aligner
export PYTHONPATH=${ALIGNER_REPO_DIR}:${PYTHONPATH:-}
nvidia-smi
git config --global --add safe.directory ${ALIGNER_REPO_DIR}
cd ${ALIGNER_REPO_DIR}
bash tests/functional/test_cases/${{ matrix.test_case }}.sh
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ default_language_version:

ci:
autofix_prs: true
autofix_commit_msg: |
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
Signed-off-by: NeMo-Aligner CI <[email protected]>
autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
autoupdate_schedule: quarterly

Expand Down
3 changes: 1 addition & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ if [[ ! -d NeMo-Aligner ]]; then
git clone https://github.com/NVIDIA/NeMo-Aligner.git
fi
cd NeMo-Aligner
git fetch -a
# -f since git status may not be clean
git fetch origin '+refs/pull/*/merge:refs/remotes/pull/*/merge'
git checkout -f $ALIGNER_COMMIT
# case 1: ALIGNER_COMMIT is a local branch so we have to apply remote changes to it
# case 2: ALIGNER_COMMIT is a commit, so git-pull is expected to fail
Expand Down
13 changes: 6 additions & 7 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,18 @@
def pytest_addoption(parser):
    """
    Register the extra command-line flags this test suite accepts.

    --cpu: boolean switch; per its help text, passing it selects CPU for
           testing, otherwise GPU is used.
    """
    cpu_help = "pass that argument to use CPU during testing (DEFAULT: False = GPU)"
    parser.addoption("--cpu", action="store_true", help=cpu_help)


def pytest_configure(config):
    """Add the ``run_only_on(device)`` entry to the ``markers`` ini setting."""
    marker_spec = "run_only_on(device): runs the test only on a given device [CPU | GPU]"
    config.addinivalue_line("markers", marker_spec)


@pytest.fixture
def device(request):
""" Simple fixture returning string denoting the device [CPU | GPU] """
Expand All @@ -39,9 +44,3 @@ def run_only_on_device_fixture(request, device):
if request.node.get_closest_marker("run_only_on"):
if request.node.get_closest_marker("run_only_on").args[0] != device:
pytest.skip("skipped on this device: {}".format(device))


def pytest_configure(config):
    """Add the ``run_only_on(device)`` entry to the ``markers`` ini setting."""
    marker_spec = "run_only_on(device): runs the test only on a given device [CPU | GPU]"
    config.addinivalue_line("markers", marker_spec)
14 changes: 14 additions & 0 deletions tests/run_unit.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
#
# Runs the unit tests under torchrun with two worker processes.
#
# Usage: bash tests/run_unit.sh [extra pytest arguments...]
#
# Any arguments given to this script are forwarded unchanged to pytest.
# Exits with status 1 if fewer than two GPUs are reported by nvidia-smi.

# Resolve the directory containing this script so it can be invoked from any
# working directory; all relative paths below are relative to it.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Quoted so a checkout path containing spaces is not word-split; abort if the
# directory cannot be entered rather than running pytest from the wrong place.
cd -- "$SCRIPT_DIR" || exit 1

# One line of nvidia-smi output per GPU, so the line count is the GPU count.
NUM_GPUS_AVAILABLE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)

if [[ $NUM_GPUS_AVAILABLE -lt 2 ]]; then
    echo "[ERROR]: Unit tests require at least 2 gpus"
    exit 1
fi

# Put the parent of this script's directory first on the import path.
PYTHONPATH="$(realpath ..):${PYTHONPATH:-}"
export PYTHONPATH

# Pin the run to GPUs 0 and 1, one per torchrun worker. "$@" (quoted) forwards
# each caller-supplied argument as-is, e.g. -k "foo and bar" stays one argument.
CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node 2 -m pytest .. -rA -s -x "$@"
1 change: 1 addition & 0 deletions tests/test_distributed.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ def loss_func(
return loss


@pytest.mark.skip(reason="Tests currently hang and causes long delays")
class TestDistributedFunctions:
def _init_distributed(self, local_rank, main_address, main_port, nprocs):
if torch.distributed.is_available() and not torch.distributed.is_initialized():
Expand Down

0 comments on commit bf24e5c

Please sign in to comment.