just see if change in containerd config works
justinthelaw committed Sep 18, 2024
1 parent d6aacf0 · commit c9e7840
Showing 1 changed file with 34 additions and 68 deletions.
.github/workflows/e2e-vllm.yaml (102 changes: 34 additions & 68 deletions)
@@ -113,85 +113,51 @@ jobs:
           uds zarf tools kubectl get nodes
           uds zarf tools kubectl describe node k3d-uds-server-0
           uds zarf tools kubectl get daemonset nvidia-device-plugin-daemonset -n kube-system
           uds zarf tools kubectl exec -it daemonset/nvidia-device-plugin-daemonset -n kube-system -c nvidia-device-plugin-ctr -- nvidia-smi

-          # Apply the CUDA test pod
-          uds zarf tools kubectl apply -f packages/k3d-gpu/test/cuda-device-query.yaml
-          sleep 20
-          uds zarf tools kubectl logs -l app=gpu-pod --namespace=default
-
-          # Set a max number of retries
-          max_retries=5
-          retry_count=0
-
-          # While loop to check the pod status (retry max 5 times)
-          while [[ $(uds zarf tools kubectl get pod -l app=gpu-pod --namespace=default -o jsonpath='{.items[*].status.phase}') != "Succeeded" ]]; do
-            echo "Waiting for pod to complete..."
-
-            # Display pod details
-            echo "Fetching pod details..."
-            uds zarf tools kubectl describe pod -l app=gpu-pod --namespace=default
-
-            sleep 5
-            ((retry_count++))
-
-            # Break the loop after 5 retries
-            if [[ $retry_count -ge $max_retries ]]; then
-              echo "Max retries reached. Fetching pod logs and failure reason..."
-
-              # Fetch pod logs
-              uds zarf tools kubectl logs -l app=gpu-pod --namespace=default
-
-              # Fetch the reason for failure
-              echo "Fetching failure reason..."
-              uds zarf tools kubectl get pod -l app=gpu-pod --namespace=default -o jsonpath='{.items[*].status.containerStatuses[*].state.terminated.reason}'
-
-              break
-            fi
-          done
-
-          # If pod succeeded, display logs
-          if [[ $(uds zarf tools kubectl get pod -l app=gpu-pod --namespace=default -o jsonpath='{.items[*].status.phase}') == "Succeeded" ]]; then
-            echo "Pod completed successfully!"
-            uds zarf tools kubectl logs -l app=gpu-pod --namespace=default
-          fi
-
-      # - name: Setup API and Supabase
-      #   uses: ./.github/actions/lfai-core
-
-      # - name: Setup Python
-      #   uses: ./.github/actions/python
-      #   with:
-      #     additionalOptionalDep: dev-vllm
-
-      # #######
-      # # vllm
-      # #######
-      # - name: Deploy vLLM
-      #   run: |
-      #     make build-vllm LOCAL_VERSION=e2e-test DOCKER_FLAGS="--build-arg MAX_CONTEXT_LENGTH=500"
-
-      #     make local-registry
-      #     make sdk-wheel LOCAL_VERSION=e2e-test
-      #     docker build --build-arg MAX_CONTEXT_LENGTH=500 --build-arg LOCAL_VERSION=e2e-test -t ghcr.io/defenseunicorns/leapfrogai/vllm:e2e-test -f packages/vllm/Dockerfile .
-      #     docker tag ghcr.io/defenseunicorns/leapfrogai/vllm:e2e-test localhost:5000/defenseunicorns/leapfrogai/vllm:e2e-test
-      #     docker push localhost:5000/defenseunicorns/leapfrogai/vllm:e2e-test
-
-      #     uds zarf package create packages/vllm --flavor upstream -o packages/vllm --registry-override=ghcr.io=localhost:5000 --insecure --set IMAGE_VERSION=e2e-test --confirm
-
-      #     docker image prune -af
-
-      #     uds zarf package deploy packages/vllm/zarf-package-vllm-amd64-e2e-test.tar.zst -l=trace --confirm
-      #     rm packages/vllm/zarf-package-vllm-amd64-e2e-test.tar.zst
-
-      #     # Check vLLM deployment logs for issues
-      #     while [[ $(uds zarf tools kubectl get pod -l app=lfai-vllm --namespace=leapfrogai -o jsonpath='{.items[*].status.phase}') != "Running" ]]; do
-      #       echo "Waiting for pod to be ready..."
-      #       sleep 5
-      #     done
-
-      #     uds zarf tools kubectl logs -n leapfrogai deployment/vllm-model
-
-      # - name: Test vLLM
-      #   env:
-      #     MODEL_NAME: vllm
-      #   run: |
-      #     python -m pytest ./tests/e2e/test_llm_generation.py -vv
+
+      - name: Setup API and Supabase
+        uses: ./.github/actions/lfai-core
+
+      - name: Setup Python
+        uses: ./.github/actions/python
+        with:
+          additionalOptionalDep: dev-vllm
+
+      #######
+      # vllm
+      #######
+      - name: Deploy vLLM
+        run: |
+          make build-vllm LOCAL_VERSION=e2e-test DOCKER_FLAGS="--build-arg MAX_CONTEXT_LENGTH=500"
+
+          make local-registry
+          make sdk-wheel LOCAL_VERSION=e2e-test
+          docker build --build-arg MAX_CONTEXT_LENGTH=500 --build-arg LOCAL_VERSION=e2e-test -t ghcr.io/defenseunicorns/leapfrogai/vllm:e2e-test -f packages/vllm/Dockerfile .
+          docker tag ghcr.io/defenseunicorns/leapfrogai/vllm:e2e-test localhost:5000/defenseunicorns/leapfrogai/vllm:e2e-test
+          docker push localhost:5000/defenseunicorns/leapfrogai/vllm:e2e-test
+
+          uds zarf package create packages/vllm --flavor upstream -o packages/vllm --registry-override=ghcr.io=localhost:5000 --insecure --set IMAGE_VERSION=e2e-test --confirm
+
+          docker image prune -af
+
+          uds zarf package deploy packages/vllm/zarf-package-vllm-amd64-e2e-test.tar.zst -l=trace --confirm
+          rm packages/vllm/zarf-package-vllm-amd64-e2e-test.tar.zst
+
+          # Check vLLM deployment logs for issues
+          while [[ $(uds zarf tools kubectl get pod -l app=lfai-vllm --namespace=leapfrogai -o jsonpath='{.items[*].status.phase}') != "Running" ]]; do
+            echo "Waiting for pod to be ready..."
+            sleep 5
+          done
+
+          uds zarf tools kubectl logs -n leapfrogai deployment/vllm-model
+
+      - name: Test vLLM
+        env:
+          MODEL_NAME: vllm
+        run: |
+          python -m pytest ./tests/e2e/test_llm_generation.py -vv

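Note on the readiness check added above: the new while loop polls the lfai-vllm pod until it reports Running and has no retry ceiling, so a pod that never starts keeps the job spinning until the workflow itself times out. Below is a minimal sketch, not part of this commit, of a bounded alternative; it assumes `uds zarf tools kubectl` passes standard kubectl subcommands (such as `kubectl wait`) straight through and that a 10-minute ceiling is acceptable for this environment.

    # Sketch only: wait up to 10 minutes for the vLLM pod to become Ready,
    # and dump deployment logs before failing the step if it never does.
    if ! uds zarf tools kubectl wait pod -l app=lfai-vllm --namespace=leapfrogai --for=condition=Ready --timeout=600s; then
      echo "vLLM pod did not become Ready in time; fetching logs..."
      uds zarf tools kubectl logs -n leapfrogai deployment/vllm-model
      exit 1
    fi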