Skip to content

Commit

Permalink
Merge pull request #180 from aws-deepracer-community/dev
Browse files Browse the repository at this point in the history
Combined Image and Metrics enhancements
  • Loading branch information
larsll authored Jul 16, 2024
2 parents f28f28c + 22da1f8 commit 7ca8e5e
Show file tree
Hide file tree
Showing 16 changed files with 176 additions and 94 deletions.
57 changes: 39 additions & 18 deletions bin/activate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@ if [[ "$(type service 2>/dev/null)" ]]; then
service docker status >/dev/null || sudo service docker start
fi

## Check if WSL2
# /proc/version has to mention Microsoft (any letter case) and the literal
# string WSL2; only then is IS_WSL2 set. Otherwise it is left untouched.
if grep -qi Microsoft /proc/version; then
  if grep -q "WSL2" /proc/version; then
    IS_WSL2="yes"
  fi
fi

# Check if we will use Docker Swarm or Docker Compose
# If not defined then use Swarm
if [[ -z "${DR_DOCKER_STYLE}" ]]; then
Expand Down Expand Up @@ -130,10 +135,25 @@ else
DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml"
fi

# Prevent docker swarms to restart
# Add host X support for Linux and WSL2
if [[ "${DR_HOST_X,,}" == "true" ]]; then
DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local-xorg.yml"
DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local-xorg.yml"
if [[ "$IS_WSL2" == "yes" ]]; then

  # xset is the command probed for here; on Debian/Ubuntu it is shipped in
  # the x11-xserver-utils package, so that is the name reported to the user.
  if ! command -v xset &>/dev/null; then
    echo "WARNING: Package x11-xserver-utils is not installed. Please install it to enable X11 support."
  fi

  # The GUI switch is DR_GUI_ENABLE; DR_USE_GUI is still honoured so that an
  # environment which exports the older name keeps getting the warning.
  if [[ "${DR_DOCKER_STYLE,,}" == "swarm" && ("${DR_GUI_ENABLE,,}" == "true" || "${DR_USE_GUI,,}" == "true") ]]; then
    echo "WARNING: Cannot use GUI in Swarm mode. Please switch to Compose mode."
  fi

  # WSL2: append the WSL specific X override to both compose file lists.
  DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local-xorg-wsl.yml"
  DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local-xorg-wsl.yml"
else
  # Any other host: append the regular local X override.
  DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local-xorg.yml"
  DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local-xorg.yml"
fi
fi

# Prevent docker swarms to restart
Expand Down Expand Up @@ -184,24 +204,25 @@ if [[ -n "${DR_MINIO_COMPOSE_FILE}" ]]; then
fi

## Version check
DEPENDENCY_VERSION=$(jq -r '.master_version | select (.!=null)' $DIR/defaults/dependencies.json)

SAGEMAKER_VER=$(docker inspect awsdeepracercommunity/deepracer-sagemaker:$DR_SAGEMAKER_IMAGE 2>/dev/null | jq -r .[].Config.Labels.version)
if [ -z "$SAGEMAKER_VER" ]; then SAGEMAKER_VER=$DR_SAGEMAKER_IMAGE; fi
if ! verlte $DEPENDENCY_VERSION $SAGEMAKER_VER; then
echo "WARNING: Incompatible version of Deepracer Sagemaker. Expected >$DEPENDENCY_VERSION. Got $SAGEMAKER_VER."
# Both DR_SIMAPP_SOURCE and DR_SIMAPP_VERSION must be non-empty. If either is
# missing, print instructions (including the default simapp version read from
# defaults/dependencies.json) and stop processing this script.
if ! [[ -n "$DR_SIMAPP_SOURCE" && -n "$DR_SIMAPP_VERSION" ]]; then
  DEFAULT_SIMAPP_VERSION=$(jq -r '.containers.simapp | select (.!=null)' $DIR/defaults/dependencies.json)
  cat <<EOF
ERROR: Variable DR_SIMAPP_SOURCE or DR_SIMAPP_VERSION not defined.

As of version 5.3 the variables DR_SIMAPP_SOURCE and DR_SIMAPP_VERSION are required in system.env.
To continue to use the separate Sagemaker, Robomaker and RL Coach images, run 'git checkout legacy'.

Please add the following lines to your system.env file:
DR_SIMAPP_SOURCE=awsdeepracercommunity/deepracer-simapp
DR_SIMAPP_VERSION=${DEFAULT_SIMAPP_VERSION}-gpu
EOF
  return
fi

ROBOMAKER_VER=$(docker inspect awsdeepracercommunity/deepracer-robomaker:$DR_ROBOMAKER_IMAGE 2>/dev/null | jq -r .[].Config.Labels.version)
if [ -z "$ROBOMAKER_VER" ]; then ROBOMAKER_VER=$DR_ROBOMAKER_IMAGE; fi
if ! verlte $DEPENDENCY_VERSION $ROBOMAKER_VER; then
echo "WARNING: Incompatible version of Deepracer Robomaker. Expected >$DEPENDENCY_VERSION. Got $ROBOMAKER_VER."
fi
DEPENDENCY_VERSION=$(jq -r '.master_version | select (.!=null)' $DIR/defaults/dependencies.json)

COACH_VER=$(docker inspect awsdeepracercommunity/deepracer-rlcoach:$DR_COACH_IMAGE 2>/dev/null | jq -r .[].Config.Labels.version)
if [ -z "$COACH_VER" ]; then COACH_VER=$DR_COACH_IMAGE; fi
if ! verlte $DEPENDENCY_VERSION $COACH_VER; then
echo "WARNING: Incompatible version of Deepracer-for-Cloud Coach. Expected >$DEPENDENCY_VERSION. Got $COACH_VER."
# Read the version label of the configured simapp image. `// empty` makes jq
# print nothing (instead of the string "null") when the label is absent, so
# the fallback below also covers images without a version label.
SIMAPP_VER=$(docker inspect "${DR_SIMAPP_SOURCE}:${DR_SIMAPP_VERSION}" 2>/dev/null | jq -r '.[].Config.Labels.version // empty')
# Fall back to the configured image tag; DR_SIMAPP_VERSION is the variable
# used for the image reference above.
if [ -z "$SIMAPP_VER" ]; then SIMAPP_VER=$DR_SIMAPP_VERSION; fi
# Warn when verlte reports that DEPENDENCY_VERSION is not <= the image version.
if ! verlte "$DEPENDENCY_VERSION" "$SIMAPP_VER"; then
  echo "WARNING: Incompatible version of Deepracer Simapp. Expected >$DEPENDENCY_VERSION. Got $SIMAPP_VER."
fi

## Create a dr-local-aws command
Expand Down
49 changes: 27 additions & 22 deletions bin/init.sh
Original file line number Diff line number Diff line change
Expand Up @@ -142,35 +142,40 @@ for arg in "$@"; do
done

# Download docker images. Change to build statements if locally built images are desired.
COACH_VERSION=$(jq -r '.containers.rl_coach | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json)
sed -i "s/<COACH_TAG>/$COACH_VERSION/g" $INSTALL_DIR/system.env

ROBOMAKER_VERSION=$(jq -r '.containers.robomaker | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json)
if [ -n $ROBOMAKER_VERSION ]; then
ROBOMAKER_VERSION=$ROBOMAKER_VERSION-$CPU_LEVEL
else
ROBOMAKER_VERSION=$CPU_LEVEL
fi
sed -i "s/<ROBO_TAG>/$ROBOMAKER_VERSION/g" $INSTALL_DIR/system.env

SAGEMAKER_VERSION=$(jq -r '.containers.sagemaker | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json)
if [ -n $SAGEMAKER_VERSION ]; then
SAGEMAKER_VERSION=$SAGEMAKER_VERSION-$SAGEMAKER_TAG
else
SAGEMAKER_VERSION=$SAGEMAKER_TAG
fi
sed -i "s/<SAGE_TAG>/$SAGEMAKER_VERSION/g" $INSTALL_DIR/system.env

docker pull awsdeepracercommunity/deepracer-rlcoach:$COACH_VERSION
docker pull awsdeepracercommunity/deepracer-robomaker:$ROBOMAKER_VERSION
docker pull awsdeepracercommunity/deepracer-sagemaker:$SAGEMAKER_VERSION
SIMAPP_VERSION=$(jq -r '.containers.simapp | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json)
sed -i "s/<SIMAPP_VERSION_TAG>/$SIMAPP_VERSION-$SAGEMAKER_TAG/g" $INSTALL_DIR/system.env
docker pull awsdeepracercommunity/deepracer-simapp:$SIMAPP_VERSION-$SAGEMAKER_TAG

# create the network sagemaker-local if it doesn't exist
SAGEMAKER_NW='sagemaker-local'

if [[ "${OPT_STYLE}" == "swarm" ]]; then

# `docker node ls` succeeding means this node already manages a swarm:
# report it and stop.
if docker node ls >/dev/null 2>&1; then
  echo "Swarm exists. Exiting."
  exit 1
fi

# Create the swarm. If the plain init fails, retry once with an explicit
# advertise address taken from the interface that carries the default route.
if ! docker swarm init; then

  # Take the device named after the "dev" keyword of the first default route,
  # so the lookup does not depend on the field position and yields one name
  # even when several default routes exist.
  DEFAULT_IFACE=$(ip route | awk '$1 == "default" { for (i = 2; i < NF; i++) if ($i == "dev") { print $(i + 1); exit } }')

  # First IPv4 address of that interface, without the prefix length. Skipped
  # when no interface was found, so no unrelated address is picked up.
  DEFAULT_IP=""
  if [ -n "$DEFAULT_IFACE" ]; then
    DEFAULT_IP=$(ip addr show "$DEFAULT_IFACE" | awk '$1 == "inet" { sub(/\/.*/, "", $2); print $2; exit }')
  fi

  if [ -z "$DEFAULT_IP" ]; then
    echo "Could not determine default IP address. Exiting."
    exit 1
  fi

  echo "Error when creating swarm, trying again with advertise address $DEFAULT_IP."
  if ! docker swarm init --advertise-addr "$DEFAULT_IP"; then
    echo "Could not create swarm. Exiting."
    exit 1
  fi
fi

SWARM_NODE=$(docker node inspect self | jq .[0].ID -r)
docker node update --label-add Sagemaker=true $SWARM_NODE >/dev/null 2>/dev/null
docker node update --label-add Robomaker=true $SWARM_NODE >/dev/null 2>/dev/null
Expand Down
38 changes: 21 additions & 17 deletions bin/scripts_wrapper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ function dr-increment-training {
}

function dr-stop-training {
ROBOMAKER_COMMAND="" bash -c "cd $DR_DIR/scripts/training && ./stop.sh"
bash -c "cd $DR_DIR/scripts/training && ./stop.sh"
}

function dr-start-evaluation {
Expand All @@ -55,21 +55,21 @@ function dr-start-evaluation {
}

function dr-stop-evaluation {
ROBOMAKER_COMMAND="" bash -c "cd $DR_DIR/scripts/evaluation && ./stop.sh"
bash -c "cd $DR_DIR/scripts/evaluation && ./stop.sh"
}

# Tournament mode has been retired; this command only prints a notice that
# points the user to Head-to-Model evaluation.
function dr-start-tournament {
  printf '%s\n' "Tournaments are no longer supported. Use Head-to-Model evaluation instead."
}

function dr-start-loganalysis {
ROBOMAKER_COMMAND="" bash -c "cd $DR_DIR/scripts/log-analysis && ./start.sh"
bash -c "cd $DR_DIR/scripts/log-analysis && ./start.sh"
}

function dr-stop-loganalysis {
eval LOG_ANALYSIS_ID=$(docker ps | awk ' /deepracer-analysis/ { print $1 }')
if [ -n "$LOG_ANALYSIS_ID" ]; then
ROBOMAKER_COMMAND="" bash -c "cd $DR_DIR/scripts/log-analysis && ./stop.sh"
bash -c "cd $DR_DIR/scripts/log-analysis && ./stop.sh"
else
echo "Log-analysis is not running."
fi
Expand Down Expand Up @@ -138,19 +138,23 @@ function dr-find-sagemaker {
STACK_NAME="deepracer-$DR_RUN_ID"
RUN_NAME=${DR_LOCAL_S3_MODEL_PREFIX}

SAGEMAKER_CONTAINERS=$(docker ps | awk ' /sagemaker/ { print $1 } ' | xargs)

if [[ -n $SAGEMAKER_CONTAINERS ]]; then
for CONTAINER in $SAGEMAKER_CONTAINERS; do
CONTAINER_NAME=$(docker ps --format '{{.Names}}' --filter id=$CONTAINER)
CONTAINER_PREFIX=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $1')
COMPOSE_SERVICE_NAME=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $2')
COMPOSE_FILE=$(sudo find /tmp/sagemaker -name docker-compose.yaml -exec grep -l "$RUN_NAME" {} + | grep $CONTAINER_PREFIX)
if [[ -n $COMPOSE_FILE ]]; then
echo $CONTAINER
return
fi
done
SAGEMAKER_CONTAINERS=$(docker ps | awk ' /simapp/ { print $1 } ' | xargs)

if [[ -n "$SAGEMAKER_CONTAINERS" ]]; then
for CONTAINER in $SAGEMAKER_CONTAINERS; do
CONTAINER_NAME=$(docker ps --format '{{.Names}}' --filter id=$CONTAINER)
CONTAINER_PREFIX=$(echo $CONTAINER_NAME | perl -n -e'/(.*)-(algo-(.)-(.*))/; print $1')
COMPOSE_SERVICE_NAME=$(echo $CONTAINER_NAME | perl -n -e'/(.*)-(algo-(.)-(.*))/; print $2')

if [[ -n "$COMPOSE_SERVICE_NAME" ]]; then
COMPOSE_FILES=$(sudo find /tmp/sagemaker -name docker-compose.yaml -exec grep -l "$COMPOSE_SERVICE_NAME" {} +)
for COMPOSE_FILE in $COMPOSE_FILES; do
if sudo grep -q "RUN_ID=${DR_RUN_ID}" $COMPOSE_FILE && sudo grep -q "${RUN_NAME}" $COMPOSE_FILE; then
echo $CONTAINER
fi
done
fi
done
fi

}
Expand Down
6 changes: 2 additions & 4 deletions defaults/dependencies.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
{
"master_version": "5.2",
"master_version": "5.3",
"containers": {
"rl_coach": "5.2.1",
"robomaker": "5.2.2",
"sagemaker": "5.2.1"
"simapp": "5.3.1"
}
}
6 changes: 3 additions & 3 deletions defaults/template-system.env
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ DR_KINESIS_STREAM_NAME=
DR_CAMERA_MAIN_ENABLE=True
DR_CAMERA_SUB_ENABLE=False
DR_CAMERA_KVS_ENABLE=True
DR_SAGEMAKER_IMAGE=<SAGE_TAG>
DR_ROBOMAKER_IMAGE=<ROBO_TAG>
DR_ENABLE_EXTRA_KVS_OVERLAY=False
DR_SIMAPP_SOURCE=awsdeepracercommunity/deepracer-simapp
DR_SIMAPP_VERSION=<SIMAPP_VERSION_TAG>
DR_MINIO_IMAGE=latest
DR_ANALYSIS_IMAGE=cpu
DR_COACH_IMAGE=<COACH_TAG>
DR_WORKERS=1
DR_ROBOMAKER_MOUNT_LOGS=False
# DR_ROBOMAKER_MOUNT_SIMAPP_DIR=
Expand Down
7 changes: 3 additions & 4 deletions docker/docker-compose-eval.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,17 @@ networks:

services:
rl_coach:
image: awsdeepracercommunity/deepracer-rlcoach:${DR_COACH_IMAGE}
image: ${DR_SIMAPP_SOURCE}:${DR_SIMAPP_VERSION}
command: ["/bin/bash", "-c", "echo No work for coach in Evaluation Mode"]
robomaker:
image: awsdeepracercommunity/deepracer-robomaker:${DR_ROBOMAKER_IMAGE}
command: ["${ROBOMAKER_COMMAND}"]
image: ${DR_SIMAPP_SOURCE}:${DR_SIMAPP_VERSION}
command: ["${ROBOMAKER_COMMAND:-}"]
ports:
- "${DR_ROBOMAKER_EVAL_PORT}:8080"
environment:
- CUDA_VISIBLE_DEVICES=${DR_ROBOMAKER_CUDA_DEVICES:-}
- DEBUG_REWARD=${DR_EVAL_DEBUG_REWARD}
- WORLD_NAME=${DR_WORLD_NAME}
- NUMBER_OF_TRIALS=${DR_NUMBER_OF_EPISODES}
- MODEL_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX}
- MODEL_S3_BUCKET=${DR_LOCAL_S3_BUCKET}
- APP_REGION=${DR_AWS_APP_REGION}
Expand Down
15 changes: 15 additions & 0 deletions docker/docker-compose-local-xorg-wsl.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Compose override for the robomaker service when the host X server is used
# under WSL2.
version: '3.7'

services:
  robomaker:
    environment:
      # DISPLAY has no value here, so it is passed through from the
      # environment docker compose runs in.
      - DISPLAY
      - USE_EXTERNAL_X=${DR_HOST_X}
      - QT_X11_NO_MITSHM=1
      - LD_LIBRARY_PATH=/usr/lib/wsl/lib
    volumes:
      - '/tmp/.X11-unix/:/tmp/.X11-unix'
      - '/mnt/wslg:/mnt/wslg'
      - '/usr/lib/wsl:/usr/lib/wsl'
    devices:
      - /dev/dxg
16 changes: 11 additions & 5 deletions docker/docker-compose-training.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,13 @@ networks:

services:
rl_coach:
image: awsdeepracercommunity/deepracer-rlcoach:${DR_COACH_IMAGE}
image: ${DR_SIMAPP_SOURCE}:${DR_SIMAPP_VERSION}
command: ["python3 /opt/code/ml/rl_coach/start.py"]
working_dir: "/opt/ml/code/"
environment:
- RUN_ID=${DR_RUN_ID}
- AWS_REGION=${DR_AWS_APP_REGION}
- SAGEMAKER_IMAGE=${DR_SAGEMAKER_IMAGE}
- SAGEMAKER_IMAGE=${DR_SIMAPP_SOURCE}:${DR_SIMAPP_VERSION}
- PRETRAINED=${DR_LOCAL_S3_PRETRAINED}
- PRETRAINED_S3_PREFIX=${DR_LOCAL_S3_PRETRAINED_PREFIX}
- PRETRAINED_S3_BUCKET=${DR_LOCAL_S3_BUCKET}
Expand All @@ -21,12 +24,15 @@ services:
- MODELMETADATA_FILE_S3_KEY=${DR_LOCAL_S3_MODEL_METADATA_KEY}
- CUDA_VISIBLE_DEVICES=${DR_SAGEMAKER_CUDA_DEVICES:-}
- MAX_MEMORY_STEPS=${DR_TRAIN_MAX_STEPS_PER_ITERATION:-}
- TELEGRAF_HOST=${DR_TELEGRAF_HOST:-}
- TELEGRAF_PORT=${DR_TELEGRAF_PORT:-}

volumes:
- "/var/run/docker.sock:/var/run/docker.sock"
- "/tmp/sagemaker:/tmp/sagemaker"
robomaker:
image: awsdeepracercommunity/deepracer-robomaker:${DR_ROBOMAKER_IMAGE}
command: ["${ROBOMAKER_COMMAND}"]
image: ${DR_SIMAPP_SOURCE}:${DR_SIMAPP_VERSION}
command: ["${ROBOMAKER_COMMAND:-}"]
ports:
- "${DR_ROBOMAKER_TRAIN_PORT}:8080"
- "${DR_ROBOMAKER_GUI_PORT}:5900"
Expand All @@ -47,4 +53,4 @@ services:
- GAZEBO_ARGS=${DR_GAZEBO_ARGS:-}
- TELEGRAF_HOST=${DR_TELEGRAF_HOST:-}
- TELEGRAF_PORT=${DR_TELEGRAF_PORT:-}
init: true
init: true
13 changes: 12 additions & 1 deletion docs/opengl.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,22 @@ This also applies for a desktop computer where you are not logged in. In this ca
* Setup an X-server on the host. `utils/setup-xorg.sh` is a basic installation script.
* Configure DRfC using the following settings in `system.env`:
* `DR_HOST_X=True`; uses the local X server rather than starting one within the docker container.
* `DR_ROBOMAKER_IMAGE`; choose the tag for an OpenGL enabled image - e.g. `cpu-gl-avx` for an image where Tensorflow will use CPU or `gpu-gl` for an image where also Tensorflow will use the GPU.
* `DR_DISPLAY`; the X display that the headless X server will start on. (Default is `:99`, avoid using `:0` or `:1` as it may conflict with other X servers.)

Start up the X server with `utils/start-xorg.sh`.

If `DR_GUI_ENABLE=True` then a VNC server will be started on port 5900 so that you can connect and interact with the Gazebo UI.

Check that OpenGL is working by looking for `gzserver` in `nvidia-smi`.

## WSL2 on Windows 11

OpenGL is also supported in WSL2 on Windows 11. By default an Xwayland server is started in Ubuntu 22.04.

To enable OpenGL acceleration perform the following steps:
* Install the `x11-xserver-utils` package (it provides `xset`) with `sudo apt install x11-xserver-utils`.
* Configure DRfC using the following settings in `system.env`:
* `DR_HOST_X=True`; uses the local X server rather than starting one within the docker container.
* `DR_DISPLAY=:0`; Xwayland starts on display `:0` by default.

If you want to interact with the Gazebo UI, set `DR_DOCKER_STYLE=compose` and `DR_GUI_ENABLE=True` in `system.env`.
1 change: 1 addition & 0 deletions scripts/evaluation/prepare-config.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def str2bool(v):
config['CAMERA_MAIN_ENABLE'] = os.environ.get('DR_CAMERA_MAIN_ENABLE', 'True')
config['CAMERA_SUB_ENABLE'] = os.environ.get('DR_CAMERA_SUB_ENABLE', 'True')
config['REVERSE_DIR'] = os.environ.get('DR_EVAL_REVERSE_DIRECTION', False)
config['ENABLE_EXTRA_KVS_OVERLAY'] = os.environ.get('DR_ENABLE_EXTRA_KVS_OVERLAY', 'False')

# Object Avoidance
if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE':
Expand Down
15 changes: 12 additions & 3 deletions scripts/evaluation/start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ while getopts ":qc" opt; do
esac
done

## Check if WSL2
# Guard form: unless /proc/version names both Microsoft (case-insensitive)
# and WSL2, nothing is changed; otherwise the host is flagged as WSL2.
if ! grep -qi Microsoft /proc/version || ! grep -q "WSL2" /proc/version; then
  : # not WSL2, IS_WSL2 stays as it was
else
  IS_WSL2="yes"
fi

# set evaluation specific environment variables
STACK_NAME="deepracer-eval-$DR_RUN_ID"
STACK_CONTAINERS=$(docker stack ps $STACK_NAME 2>/dev/null | wc -l)
Expand All @@ -44,6 +49,10 @@ if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then
fi
fi

echo "Evaluation of model s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX starting."
echo "Using image ${DR_SIMAPP_SOURCE}:${DR_SIMAPP_VERSION}"
echo ""

# clone if required
if [ -n "$OPT_CLONE" ]; then
echo "Cloning model into s3://$DR_LOCAL_S3_BUCKET/${DR_LOCAL_S3_MODEL_PREFIX}-E"
Expand Down Expand Up @@ -79,14 +88,14 @@ if [[ "${DR_HOST_X,,}" == "true" ]]; then

if ! DISPLAY=$ROBO_DISPLAY timeout 1s xset q &>/dev/null; then
echo "No X Server running on display $ROBO_DISPLAY. Exiting"
exit 0
exit 1
fi

if [[ -z "$XAUTHORITY" ]]; then
if [[ -z "$XAUTHORITY" && "$IS_WSL2" != "yes" ]]; then
export XAUTHORITY=~/.Xauthority
if [[ ! -f "$XAUTHORITY" ]]; then
echo "No XAUTHORITY defined. .Xauthority does not exist. Stopping."
exit 0
exit 1
fi
fi
fi
Expand Down
Loading

0 comments on commit 7ca8e5e

Please sign in to comment.