From a58d358eb3c182d7087c30aab9ed3b6bdbe636d3 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 27 Dec 2023 10:31:19 -0800 Subject: [PATCH] remove test files --- .../bulk_sampling/mg_utils/README.md | 6 - .../bulk_sampling/mg_utils/default-config.sh | 39 --- .../bulk_sampling/mg_utils/functions.sh | 66 ----- .../mg_utils/run-dask-process.sh | 247 ------------------ .../mg_utils/wait_for_workers.py | 124 --------- 5 files changed, 482 deletions(-) delete mode 100644 benchmarks/cugraph/standalone/bulk_sampling/mg_utils/README.md delete mode 100755 benchmarks/cugraph/standalone/bulk_sampling/mg_utils/default-config.sh delete mode 100644 benchmarks/cugraph/standalone/bulk_sampling/mg_utils/functions.sh delete mode 100755 benchmarks/cugraph/standalone/bulk_sampling/mg_utils/run-dask-process.sh delete mode 100644 benchmarks/cugraph/standalone/bulk_sampling/mg_utils/wait_for_workers.py diff --git a/benchmarks/cugraph/standalone/bulk_sampling/mg_utils/README.md b/benchmarks/cugraph/standalone/bulk_sampling/mg_utils/README.md deleted file mode 100644 index 26dbbd5e705..00000000000 --- a/benchmarks/cugraph/standalone/bulk_sampling/mg_utils/README.md +++ /dev/null @@ -1,6 +0,0 @@ -This directory contains various scripts helpful for cugraph users and developers. - -The following scripts were copied from https://github.com/rapidsai/multi-gpu-tools and are useful for starting a dask cluster, which is needed by cugraph for multi-GPU support. -* `run-dask-process.sh` -* `functions.sh` -* `default-config.sh` diff --git a/benchmarks/cugraph/standalone/bulk_sampling/mg_utils/default-config.sh b/benchmarks/cugraph/standalone/bulk_sampling/mg_utils/default-config.sh deleted file mode 100755 index 26cef2aee78..00000000000 --- a/benchmarks/cugraph/standalone/bulk_sampling/mg_utils/default-config.sh +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -THIS_DIR=$(cd $(dirname ${BASH_SOURCE[0]}) && pwd) - -# Most are defined using the bash := or :- syntax, which means they -# will be set only if they were previously unset. The project config -# is loaded first, which gives it the opportunity to override anything -# in this file that uses that syntax. If there are variables in this -# file that should not be overridded by a project, then they will -# simply not use that syntax and override, since these variables are -# read last. -SCRIPTS_DIR=$THIS_DIR -WORKSPACE=$THIS_DIR - -# These really should be oerridden by the project config! -CONDA_ENV=${CONDA_ENV:-rapids} - -GPUS_PER_NODE=${GPUS_PER_NODE:-8} -WORKER_RMM_POOL_SIZE=${WORKER_RMM_POOL_SIZE:-12G} -DASK_CUDA_INTERFACE=${DASK_CUDA_INTERFACE:-ib0} -DASK_SCHEDULER_PORT=${DASK_SCHEDULER_PORT:-8792} -DASK_DEVICE_MEMORY_LIMIT=${DASK_DEVICE_MEMORY_LIMIT:-auto} -DASK_HOST_MEMORY_LIMIT=${DASK_HOST_MEMORY_LIMIT:-auto} - -BUILD_LOG_FILE=${BUILD_LOG_FILE:-${RESULTS_DIR}/build_log.txt} -SCHEDULER_FILE=${SCHEDULER_FILE:-${WORKSPACE}/dask-scheduler.json} -DATE=${DATE:-$(date --utc "+%Y-%m-%d_%H:%M:%S")_UTC} -ENV_EXPORT_FILE=${ENV_EXPORT_FILE:-${WORKSPACE}/$(basename ${CONDA_ENV})-${DATE}.txt} diff --git a/benchmarks/cugraph/standalone/bulk_sampling/mg_utils/functions.sh b/benchmarks/cugraph/standalone/bulk_sampling/mg_utils/functions.sh deleted file mode 100644 index 7eedb5f1b1f..00000000000 --- a/benchmarks/cugraph/standalone/bulk_sampling/mg_utils/functions.sh +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This file is source'd from script-env.sh to add functions to the -# calling environment, hence no #!/bin/bash as the first line. This -# also assumes the variables used in this file have been defined -# elsewhere. - -NUMARGS=$# -ARGS=$* -function hasArg { - (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ") -} - -function logger { - echo -e ">>>> $@" -} - -# Calling "setTee outfile" will cause all stdout and stderr of the -# current script to be output to "tee", which outputs to stdout and -# "outfile" simultaneously. This is useful by allowing a script to -# "tee" itself at any point without being called with tee. -_origFileDescriptorsSaved=0 -function setTee { - if [[ $_origFileDescriptorsSaved == 0 ]]; then - # Save off the original file descr 1 and 2 as 3 and 4 - exec 3>&1 4>&2 - _origFileDescriptorsSaved=1 - fi - teeFile=$1 - # Create a named pipe. - pipeName=$(mktemp -u) - mkfifo $pipeName - # Close the currnet 1 and 2 and restore to original (3, 4) in the - # event this function is called repeatedly. - exec 1>&- 2>&- - exec 1>&3 2>&4 - # Start a tee process reading from the named pipe. Redirect stdout - # and stderr to the named pipe which goes to the tee process. The - # named pipe "file" can be removed and the tee process stays alive - # until the fd is closed. - tee -a < $pipeName $teeFile & - exec > $pipeName 2>&1 - rm $pipeName -} - -# Call this to stop script output from going to "tee" after a prior -# call to setTee. -function unsetTee { - if [[ $_origFileDescriptorsSaved == 1 ]]; then - # Close the current fd 1 and 2 which should stop the tee - # process, then restore 1 and 2 to original (saved as 3, 4). - exec 1>&- 2>&- - exec 1>&3 2>&4 - fi -} diff --git a/benchmarks/cugraph/standalone/bulk_sampling/mg_utils/run-dask-process.sh b/benchmarks/cugraph/standalone/bulk_sampling/mg_utils/run-dask-process.sh deleted file mode 100755 index b88abb685ec..00000000000 --- a/benchmarks/cugraph/standalone/bulk_sampling/mg_utils/run-dask-process.sh +++ /dev/null @@ -1,247 +0,0 @@ -#!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -THIS_DIR=$(cd $(dirname ${BASH_SOURCE[0]}) && pwd) - -source ${THIS_DIR}/default-config.sh -source ${THIS_DIR}/functions.sh - -# Logs can be written to a specific location by setting the LOGS_DIR -# env var. -LOGS_DIR=${LOGS_DIR:-dask_logs-$$} - -######################################## -NUMARGS=$# -ARGS=$* -function hasArg { - (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ") -} -VALIDARGS="-h --help scheduler workers --tcp --ucx --ucxib --ucx-ib" -HELP="$0 [ ...] [ ...] - where is: - scheduler - start dask scheduler - workers - start dask workers - and is: - --tcp - initalize a tcp cluster (default) - --ucx - initialize a ucx cluster with NVLink - --ucxib | --ucx-ib - initialize a ucx cluster with IB+NVLink - -h | --help - print this text - - The cluster config order of precedence is any specification on the - command line (--tcp, --ucx, etc.) if provided, then the value of the - env var CLUSTER_CONFIG_TYPE if set, then the default value of tcp. - -" - -# CLUSTER_CONFIG_TYPE defaults to the env var value if set, else TCP -CLUSTER_CONFIG_TYPE=${CLUSTER_CONFIG_TYPE:-TCP} -START_SCHEDULER=0 -START_WORKERS=0 - -if (( ${NUMARGS} == 0 )); then - echo "${HELP}" - exit 0 -else - if hasArg -h || hasArg --help; then - echo "${HELP}" - exit 0 - fi - for a in ${ARGS}; do - if ! (echo " ${VALIDARGS} " | grep -q " ${a} "); then - echo "Invalid option: ${a}" - exit 1 - fi - done -fi - -if hasArg scheduler; then - START_SCHEDULER=1 -fi -if hasArg workers; then - START_WORKERS=1 -fi -# Allow the command line to take precedence -if hasArg --tcp; then - CLUSTER_CONFIG_TYPE=TCP -elif hasArg --ucx; then - CLUSTER_CONFIG_TYPE=UCX -elif hasArg --ucxib || hasArg --ucx-ib; then - CLUSTER_CONFIG_TYPE=UCXIB -fi - -######################################## - -#export DASK_LOGGING__DISTRIBUTED="DEBUG" - -#ulimit -n 100000 - -SCHEDULER_LOG=${LOGS_DIR}/scheduler_log.txt -WORKERS_LOG=${LOGS_DIR}/worker-${HOSTNAME}_log.txt - - -function buildTcpArgs { - export DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="100s" - export DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP="600s" - export DASK_DISTRIBUTED__COMM__RETRY__DELAY__MIN="1s" - export DASK_DISTRIBUTED__COMM__RETRY__DELAY__MAX="60s" - export DASK_DISTRIBUTED__WORKER__MEMORY__Terminate="False" - - SCHEDULER_ARGS="--protocol=tcp - --scheduler-file $SCHEDULER_FILE - " - - WORKER_ARGS="--rmm-pool-size=$WORKER_RMM_POOL_SIZE - --rmm-async - --local-directory=/tmp/$LOGNAME - --scheduler-file=$SCHEDULER_FILE - --memory-limit=$DASK_HOST_MEMORY_LIMIT - --device-memory-limit=$DASK_DEVICE_MEMORY_LIMIT - " - -} - -function buildUCXWithInfinibandArgs { - - export UCX_MAX_RNDV_RAILS=1 - export UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda - export DASK_RMM__POOL_SIZE=0.5GB - export DASK_DISTRIBUTED__COMM__UCX__CREATE_CUDA_CONTEXT=True - - SCHEDULER_ARGS="--protocol=ucx - --interface=$DASK_CUDA_INTERFACE - --scheduler-file $SCHEDULER_FILE - " - - WORKER_ARGS="--interface=$DASK_CUDA_INTERFACE - --rmm-pool-size=$WORKER_RMM_POOL_SIZE - --rmm-maximum-pool-size=$WORKER_RMM_POOL_SIZE - --local-directory=/tmp/$LOGNAME - --scheduler-file=$SCHEDULER_FILE - --memory-limit=$DASK_HOST_MEMORY_LIMIT - --device-memory-limit=$DASK_DEVICE_MEMORY_LIMIT - --enable-jit-unspill - " -} - - -function buildUCXwithoutInfinibandArgs { - - export UCX_TCP_CM_REUSEADDR=y - export UCX_MAX_RNDV_RAILS=1 - export UCX_TCP_TX_SEG_SIZE=8M - export UCX_TCP_RX_SEG_SIZE=8M - - export DASK_DISTRIBUTED__COMM__UCX__CUDA_COPY=True - export DASK_DISTRIBUTED__COMM__UCX__TCP=True - export DASK_DISTRIBUTED__COMM__UCX__NVLINK=True - export DASK_DISTRIBUTED__COMM__UCX__INFINIBAND=False - export DASK_DISTRIBUTED__COMM__UCX__RDMACM=False - export DASK_RMM__POOL_SIZE=0.5GB - - - SCHEDULER_ARGS="--protocol=ucx - --scheduler-file $SCHEDULER_FILE - " - - WORKER_ARGS="--enable-tcp-over-ucx - --enable-nvlink - --disable-infiniband - --disable-rdmacm - --rmm-pool-size=$WORKER_RMM_POOL_SIZE - --rmm-maximum-pool-size=$WORKER_RMM_POOL_SIZE - --local-directory=/tmp/$LOGNAME - --scheduler-file=$SCHEDULER_FILE - --memory-limit=$DASK_HOST_MEMORY_LIMIT - --device-memory-limit=$DASK_DEVICE_MEMORY_LIMIT - --enable-jit-unspill - " -} - -if [[ "$CLUSTER_CONFIG_TYPE" == "UCX" ]]; then - logger "Using cluster configurtion for UCX" - buildUCXwithoutInfinibandArgs -elif [[ "$CLUSTER_CONFIG_TYPE" == "UCXIB" ]]; then - logger "Using cluster configurtion for UCX with Infiniband" - buildUCXWithInfinibandArgs -else - logger "Using cluster configurtion for TCP" - buildTcpArgs -fi - - -######################################## - -scheduler_pid="" -worker_pid="" -num_scheduler_tries=0 - -function startScheduler { - mkdir -p $(dirname $SCHEDULER_FILE) - echo "RUNNING: \"python -m distributed.cli.dask_scheduler $SCHEDULER_ARGS\"" > $SCHEDULER_LOG - dask-scheduler $SCHEDULER_ARGS >> $SCHEDULER_LOG 2>&1 & - scheduler_pid=$! -} - -mkdir -p $LOGS_DIR -logger "Logs written to: $LOGS_DIR" - -if [[ $START_SCHEDULER == 1 ]]; then - rm -f $SCHEDULER_FILE $SCHEDULER_LOG $WORKERS_LOG - - startScheduler - sleep 6 - num_scheduler_tries=$(python -c "print($num_scheduler_tries+1)") - - # Wait for the scheduler to start first before proceeding, since - # it may require several retries (if prior run left ports open - # that need time to close, etc.) - while [ ! -f "$SCHEDULER_FILE" ]; do - scheduler_alive=$(ps -p $scheduler_pid > /dev/null ; echo $?) - if [[ $scheduler_alive != 0 ]]; then - if [[ $num_scheduler_tries != 30 ]]; then - echo "scheduler failed to start, retry #$num_scheduler_tries" - startScheduler - sleep 6 - num_scheduler_tries=$(echo $num_scheduler_tries+1 | bc) - else - echo "could not start scheduler, exiting." - exit 1 - fi - fi - done - echo "scheduler started." -fi - -if [[ $START_WORKERS == 1 ]]; then - rm -f $WORKERS_LOG - while [ ! -f "$SCHEDULER_FILE" ]; do - echo "run-dask-process.sh: $SCHEDULER_FILE not present - waiting to start workers..." - sleep 2 - done - echo "RUNNING: \"python -m dask_cuda.cli.dask_cuda_worker $WORKER_ARGS\"" > $WORKERS_LOG - dask-cuda-worker $WORKER_ARGS >> $WORKERS_LOG 2>&1 & - worker_pid=$! - echo "worker(s) started." -fi - -# This script will not return until the following background process -# have been completed/killed. -if [[ $worker_pid != "" ]]; then - echo "waiting for worker pid $worker_pid to finish before exiting script..." - wait $worker_pid -fi -if [[ $scheduler_pid != "" ]]; then - echo "waiting for scheduler pid $scheduler_pid to finish before exiting script..." - wait $scheduler_pid -fi diff --git a/benchmarks/cugraph/standalone/bulk_sampling/mg_utils/wait_for_workers.py b/benchmarks/cugraph/standalone/bulk_sampling/mg_utils/wait_for_workers.py deleted file mode 100644 index 29d5cb7fbd7..00000000000 --- a/benchmarks/cugraph/standalone/bulk_sampling/mg_utils/wait_for_workers.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import time -import yaml - -from dask.distributed import Client - - -def initialize_dask_cuda(communication_type): - communication_type = communication_type.lower() - if "ucx" in communication_type: - os.environ["UCX_MAX_RNDV_RAILS"] = "1" - - if communication_type == "ucx-ib": - os.environ["UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES"]="cuda" - os.environ["DASK_RMM__POOL_SIZE"]="0.5GB" - os.environ["DASK_DISTRIBUTED__COMM__UCX__CREATE_CUDA_CONTEXT"]="True" - - -def wait_for_workers( - num_expected_workers, scheduler_file_path, communication_type, timeout_after=0 -): - """ - Waits until num_expected_workers workers are available based on - the workers managed by scheduler_file_path, then returns 0. If - timeout_after is specified, will return 1 if num_expected_workers - workers are not available before the timeout. - """ - # FIXME: use scheduler file path from global environment if none - # supplied in configuration yaml - - print("wait_for_workers.py - initializing client...", end="") - sys.stdout.flush() - initialize_dask_cuda(communication_type) - print("done.") - sys.stdout.flush() - - ready = False - start_time = time.time() - while not ready: - if timeout_after and ((time.time() - start_time) >= timeout_after): - print( - f"wait_for_workers.py timed out after {timeout_after} seconds before finding {num_expected_workers} workers." - ) - sys.stdout.flush() - break - with Client(scheduler_file=scheduler_file_path) as client: - num_workers = len(client.scheduler_info()["workers"]) - if num_workers < num_expected_workers: - print( - f"wait_for_workers.py expected {num_expected_workers} but got {num_workers}, waiting..." - ) - sys.stdout.flush() - time.sleep(5) - else: - print(f"wait_for_workers.py got {num_workers} workers, done.") - sys.stdout.flush() - ready = True - - if ready is False: - return 1 - return 0 - - -if __name__ == "__main__": - import argparse - - ap = argparse.ArgumentParser() - ap.add_argument( - "--num-expected-workers", - type=int, - required=False, - help="Number of workers to wait for. If not specified, " - "uses the NUM_WORKERS env var if set, otherwise defaults " - "to 16.", - ) - ap.add_argument( - "--scheduler-file-path", - type=str, - required=True, - help="Path to shared scheduler file to read.", - ) - ap.add_argument( - "--communication-type", - type=str, - default="tcp", - required=False, - help="Initiliaze dask_cuda based on the cluster communication type." - "Supported values are tcp(default), ucx, ucxib, ucx-ib.", - ) - ap.add_argument( - "--timeout-after", - type=int, - default=0, - required=False, - help="Number of seconds to wait for workers. " - "Default is 0 which means wait forever.", - ) - args = ap.parse_args() - - if args.num_expected_workers is None: - args.num_expected_workers = os.environ.get("NUM_WORKERS", 16) - - exitcode = wait_for_workers( - num_expected_workers=args.num_expected_workers, - scheduler_file_path=args.scheduler_file_path, - communication_type=args.communication_type, - timeout_after=args.timeout_after, - ) - - sys.exit(exitcode)