Skip to content

Commit

Permalink
fix merge conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
alexbarghi-nv committed Jan 18, 2024
2 parents cff6cdf + c5d2a9a commit 5e5094a
Show file tree
Hide file tree
Showing 101 changed files with 3,215 additions and 615 deletions.
40 changes: 40 additions & 0 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,43 @@ jobs:
sha: ${{ inputs.sha }}
date: ${{ inputs.date }}
package-name: nx-cugraph
wheel-build-cugraph-dgl:
needs: wheel-publish-cugraph
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
sha: ${{ inputs.sha }}
date: ${{ inputs.date }}
script: ci/build_wheel_cugraph-dgl.sh
wheel-publish-cugraph-dgl:
needs: wheel-build-cugraph-dgl
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
sha: ${{ inputs.sha }}
date: ${{ inputs.date }}
package-name: cugraph-dgl
wheel-build-cugraph-pyg:
needs: wheel-publish-cugraph
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
sha: ${{ inputs.sha }}
date: ${{ inputs.date }}
script: ci/build_wheel_cugraph-pyg.sh
wheel-publish-cugraph-pyg:
needs: wheel-build-cugraph-pyg
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
sha: ${{ inputs.sha }}
date: ${{ inputs.date }}
package-name: cugraph-pyg
34 changes: 34 additions & 0 deletions .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ jobs:
- wheel-tests-cugraph
- wheel-build-nx-cugraph
- wheel-tests-nx-cugraph
- wheel-build-cugraph-dgl
- wheel-tests-cugraph-dgl
- wheel-build-cugraph-pyg
- wheel-tests-cugraph-pyg
- devcontainer
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
Expand Down Expand Up @@ -127,6 +131,36 @@ jobs:
with:
build_type: pull-request
script: ci/test_wheel_nx-cugraph.sh
wheel-build-cugraph-dgl:
needs: wheel-tests-cugraph
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
build_type: pull-request
script: ci/build_wheel_cugraph-dgl.sh
wheel-tests-cugraph-dgl:
needs: wheel-build-cugraph-dgl
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
build_type: pull-request
script: ci/test_wheel_cugraph-dgl.sh
matrix_filter: map(select(.ARCH == "amd64"))
wheel-build-cugraph-pyg:
needs: wheel-tests-cugraph
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
build_type: pull-request
script: ci/build_wheel_cugraph-pyg.sh
wheel-tests-cugraph-pyg:
needs: wheel-build-cugraph-pyg
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
build_type: pull-request
script: ci/test_wheel_cugraph-pyg.sh
matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "11.8.0"))
devcontainer:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
Expand Down
18 changes: 18 additions & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,21 @@ jobs:
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
script: ci/test_wheel_nx-cugraph.sh
wheel-tests-cugraph-dgl:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
build_type: nightly
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
script: ci/test_wheel_cugraph-dgl.sh
wheel-tests-cugraph-pyg:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
build_type: nightly
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
script: ci/test_wheel_cugraph-pyg.sh
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ repos:
pass_filenames: false
additional_dependencies: [gitpython]
- repo: https://github.com/rapidsai/dependency-file-generator
rev: v1.5.1
rev: v1.8.0
hooks:
- id: rapids-dependency-file-generator
args: ["--clean"]
2 changes: 1 addition & 1 deletion benchmarks/cugraph/standalone/bulk_sampling/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ You will need to modify the bash scripts to run appropriately for your environmen
desired training workflow. The standard sbatch arguments are at the top of the script, such as
job name, queue, etc. These will need to be modified for your SLURM cluster.

Next are arguments for the container image (which is currently set to the current DLFW image),
Next are arguments for the container image (required),
and directories where the data and outputs are stored. The directories default to subdirectories
of the current working directory. But if there is a high-throughput storage system available,
using that storage for the samples and datasets is highly recommended.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
os.environ["RAPIDS_NO_INITIALIZE"] = "1"
os.environ["CUDF_SPILL"] = "1"
os.environ["LIBCUDF_CUFILE_POLICY"] = "KVIKIO"
os.environ["KVIKIO_NTHREADS"] = "64"
os.environ["KVIKIO_NTHREADS"] = "8"

import argparse
import json
Expand Down
13 changes: 0 additions & 13 deletions benchmarks/cugraph/standalone/bulk_sampling/run_sampling.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,6 @@ export CUDF_SPILL=1
export LIBCUDF_CUFILE_POLICY="OFF"
export GPUS_PER_NODE=8

PATCH_CUGRAPH=1

export SCHEDULER_FILE=$SCHEDULER_FILE
export LOGS_DIR=$LOGS_DIR

Expand All @@ -60,17 +58,6 @@ else
${MG_UTILS_DIR}/run-dask-process.sh workers &
fi

if [[ $PATCH_CUGRAPH == 1 ]]; then
mkdir /opt/cugraph-patch
git clone https://github.com/alexbarghi-nv/cugraph -b dlfw-patch-24.01 /opt/cugraph-patch

rm /opt/rapids/cugraph/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
cp /opt/cugraph-patch/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py /opt/rapids/cugraph/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
rm /usr/local/lib/python3.10/dist-packages/cugraph/structure/graph_implementation/simpleDistributedGraph.py
cp /opt/cugraph-patch/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py /usr/local/lib/python3.10/dist-packages/cugraph/structure/graph_implementation/simpleDistributedGraph.py

fi

echo "properly waiting for workers to connect"
NUM_GPUS=$(python -c "import os; print(int(os.environ['SLURM_JOB_NUM_NODES'])*int(os.environ['GPUS_PER_NODE']))")
handleTimeout 120 python ${MG_UTILS_DIR}/wait_for_workers.py \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#SBATCH -N 1
#SBATCH -t 00:25:00

CONTAINER_IMAGE="/lustre/fsw/rapids/abarghi/dlfw_patched.squash"
CONTAINER_IMAGE=${CONTAINER_IMAGE:="please_specify_container"}
SCRIPTS_DIR=$(pwd)
LOGS_DIR=${LOGS_DIR:=$(pwd)"/logs"}
SAMPLES_DIR=${SAMPLES_DIR:=$(pwd)/samples}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def train(self):
time_forward = 0.0
time_backward = 0.0
time_loader = 0.0
time_feature_additional = 0.0
time_feature_transfer = 0.0
start_time = time.perf_counter()
end_time_backward = start_time

Expand All @@ -73,7 +73,7 @@ def train(self):
loader_time_iter = time.perf_counter() - end_time_backward
time_loader += loader_time_iter

additional_feature_time_start = time.perf_counter()
time_feature_transfer_start = time.perf_counter()

num_sampled_nodes = sum(
[
Expand All @@ -94,9 +94,9 @@ def train(self):
num_sampled_edges = extend_tensor(num_sampled_edges, num_layers)

data = data.to_homogeneous().cuda()
additional_feature_time_end = time.perf_counter()
time_feature_additional += (
additional_feature_time_end - additional_feature_time_start
time_feature_transfer_end = time.perf_counter()
time_feature_transfer += (
time_feature_transfer_end - time_feature_transfer_start
)

num_batches += 1
Expand All @@ -113,6 +113,9 @@ def train(self):
logger.info(f"time forward: {time_forward_iter}")
logger.info(f"time backward: {time_backward_iter}")
logger.info(f"loader time: {loader_time_iter}")
logger.info(
f"feature transfer time: {time_feature_transfer / num_batches}"
)
logger.info(f"total time: {total_time_iter}")

y_true = data.y
Expand Down Expand Up @@ -253,7 +256,8 @@ def train(self):
stats = {
"Accuracy": float(acc_sum / (i) * 100.0) if self.rank == 0 else 0.0,
"# Batches": num_batches,
"Loader Time": time_loader + time_feature_additional,
"Loader Time": time_loader,
"Feature Transfer Time": time_feature_transfer,
"Forward Time": time_forward,
"Backward Time": time_backward,
}
Expand Down
Loading

0 comments on commit 5e5094a

Please sign in to comment.