fix merge conflicts

rapidsai · Jan 18, 2024 · 5e5094a
2 parents cff6cdf + c5d2a9a
commit 5e5094a
Show file tree

Hide file tree

Showing 101 changed files with 3,215 additions and 615 deletions.
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
@@ -133,3 +133,43 @@ jobs:
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
       package-name: nx-cugraph
+  wheel-build-cugraph-dgl:
+    needs: wheel-publish-cugraph
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      script: ci/build_wheel_cugraph-dgl.sh
+  wheel-publish-cugraph-dgl:
+    needs: wheel-build-cugraph-dgl
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      package-name: cugraph-dgl
+  wheel-build-cugraph-pyg:
+    needs: wheel-publish-cugraph
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      script: ci/build_wheel_cugraph-pyg.sh
+  wheel-publish-cugraph-pyg:
+    needs: wheel-build-cugraph-pyg
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      package-name: cugraph-pyg
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
@@ -25,6 +25,10 @@ jobs:
       - wheel-tests-cugraph
       - wheel-build-nx-cugraph
       - wheel-tests-nx-cugraph
+      - wheel-build-cugraph-dgl
+      - wheel-tests-cugraph-dgl
+      - wheel-build-cugraph-pyg
+      - wheel-tests-cugraph-pyg
       - devcontainer
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/[email protected]
@@ -127,6 +131,36 @@ jobs:
     with:
       build_type: pull-request
       script: ci/test_wheel_nx-cugraph.sh
+  wheel-build-cugraph-dgl:
+    needs: wheel-tests-cugraph
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: pull-request
+      script: ci/build_wheel_cugraph-dgl.sh
+  wheel-tests-cugraph-dgl:
+    needs: wheel-build-cugraph-dgl
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: pull-request
+      script: ci/test_wheel_cugraph-dgl.sh
+      matrix_filter: map(select(.ARCH == "amd64"))
+  wheel-build-cugraph-pyg:
+    needs: wheel-tests-cugraph
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: pull-request
+      script: ci/build_wheel_cugraph-pyg.sh
+  wheel-tests-cugraph-pyg:
+    needs: wheel-build-cugraph-pyg
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: pull-request
+      script: ci/test_wheel_cugraph-pyg.sh
+      matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "11.8.0"))
   devcontainer:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/[email protected]

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -57,3 +57,21 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       script: ci/test_wheel_nx-cugraph.sh
+  wheel-tests-cugraph-dgl:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      script: ci/test_wheel_cugraph-dgl.sh
+  wheel-tests-cugraph-pyg:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      script: ci/test_wheel_cugraph-pyg.sh
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -52,7 +52,7 @@ repos:
         pass_filenames: false
         additional_dependencies: [gitpython]
   - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.5.1
+    rev: v1.8.0
     hooks:
         - id: rapids-dependency-file-generator
           args: ["--clean"]
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/README.md b/benchmarks/cugraph/standalone/bulk_sampling/README.md
@@ -143,7 +143,7 @@ You will need to modify the bash scripts to run appropriately for your environmen
 desired training workflow.  The standard sbatch arguments are at the top of the script, such as
 job name, queue, etc.  These will need to be modified for your SLURM cluster.
 
-Next are arguments for the container image (which is currently set to the current DLFW image),
+Next are arguments for the container image (required),
 and directories where the data and outputs are stored.  The directories default to subdirectories
 of the current working directory.  But if there is a high-throughput storage system available,
 using that storage for the samples and datasets is highly recommended.

diff --git a/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py b/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py
@@ -16,7 +16,7 @@
 os.environ["RAPIDS_NO_INITIALIZE"] = "1"
 os.environ["CUDF_SPILL"] = "1"
 os.environ["LIBCUDF_CUFILE_POLICY"] = "KVIKIO"
-os.environ["KVIKIO_NTHREADS"] = "64"
+os.environ["KVIKIO_NTHREADS"] = "8"
 
 import argparse
 import json

diff --git a/benchmarks/cugraph/standalone/bulk_sampling/run_sampling.sh b/benchmarks/cugraph/standalone/bulk_sampling/run_sampling.sh
@@ -36,8 +36,6 @@ export CUDF_SPILL=1
 export LIBCUDF_CUFILE_POLICY="OFF"
 export GPUS_PER_NODE=8
 
-PATCH_CUGRAPH=1
-
 export SCHEDULER_FILE=$SCHEDULER_FILE
 export LOGS_DIR=$LOGS_DIR
 
@@ -60,17 +58,6 @@ else
     ${MG_UTILS_DIR}/run-dask-process.sh workers &
 fi
 
-if [[ $PATCH_CUGRAPH == 1 ]]; then
-    mkdir /opt/cugraph-patch
-    git clone https://github.com/alexbarghi-nv/cugraph -b dlfw-patch-24.01 /opt/cugraph-patch
-
-    rm /opt/rapids/cugraph/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
-    cp /opt/cugraph-patch/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py /opt/rapids/cugraph/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
-    rm /usr/local/lib/python3.10/dist-packages/cugraph/structure/graph_implementation/simpleDistributedGraph.py
-    cp /opt/cugraph-patch/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py /usr/local/lib/python3.10/dist-packages/cugraph/structure/graph_implementation/simpleDistributedGraph.py
-
-fi
-
 echo "properly waiting for workers to connect"
 NUM_GPUS=$(python -c "import os; print(int(os.environ['SLURM_JOB_NUM_NODES'])*int(os.environ['GPUS_PER_NODE']))")
 handleTimeout 120 python ${MG_UTILS_DIR}/wait_for_workers.py \

diff --git a/benchmarks/cugraph/standalone/bulk_sampling/run_train_job.sh b/benchmarks/cugraph/standalone/bulk_sampling/run_train_job.sh
@@ -18,7 +18,7 @@
 #SBATCH -N 1
 #SBATCH -t 00:25:00 
 
-CONTAINER_IMAGE="/lustre/fsw/rapids/abarghi/dlfw_patched.squash"
+CONTAINER_IMAGE=${CONTAINER_IMAGE:="please_specify_container"}
 SCRIPTS_DIR=$(pwd)
 LOGS_DIR=${LOGS_DIR:=$(pwd)"/logs"}
 SAMPLES_DIR=${SAMPLES_DIR:=$(pwd)/samples}

diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
@@ -58,7 +58,7 @@ def train(self):
         time_forward = 0.0
         time_backward = 0.0
         time_loader = 0.0
-        time_feature_additional = 0.0
+        time_feature_transfer = 0.0
         start_time = time.perf_counter()
         end_time_backward = start_time
 
@@ -73,7 +73,7 @@ def train(self):
                     loader_time_iter = time.perf_counter() - end_time_backward
                     time_loader += loader_time_iter
 
-                    additional_feature_time_start = time.perf_counter()
+                    time_feature_transfer_start = time.perf_counter()
 
                     num_sampled_nodes = sum(
                         [
@@ -94,9 +94,9 @@ def train(self):
                     num_sampled_edges = extend_tensor(num_sampled_edges, num_layers)
 
                     data = data.to_homogeneous().cuda()
-                    additional_feature_time_end = time.perf_counter()
-                    time_feature_additional += (
-                        additional_feature_time_end - additional_feature_time_start
+                    time_feature_transfer_end = time.perf_counter()
+                    time_feature_transfer += (
+                        time_feature_transfer_end - time_feature_transfer_start
                     )
 
                     num_batches += 1
@@ -113,6 +113,9 @@ def train(self):
                         logger.info(f"time forward: {time_forward_iter}")
                         logger.info(f"time backward: {time_backward_iter}")
                         logger.info(f"loader time: {loader_time_iter}")
+                        logger.info(
+                            f"feature transfer time: {time_feature_transfer / num_batches}"
+                        )
                         logger.info(f"total time: {total_time_iter}")
 
                     y_true = data.y
@@ -253,7 +256,8 @@ def train(self):
         stats = {
             "Accuracy": float(acc_sum / (i) * 100.0) if self.rank == 0 else 0.0,
             "# Batches": num_batches,
-            "Loader Time": time_loader + time_feature_additional,
+            "Loader Time": time_loader,
+            "Feature Transfer Time": time_feature_transfer,
             "Forward Time": time_forward,
             "Backward Time": time_backward,
         }