From a53ab34b804af2865d2d210b801a759d2ca29bc6 Mon Sep 17 00:00:00 2001 From: Naim <110031745+naimnv@users.noreply.github.com> Date: Thu, 21 Sep 2023 19:39:18 +0200 Subject: [PATCH] Refactor python code for similarity algos to use latest CAPI (#3828) This PR - refactors python code for similarity algorithms (Jaccard, Sorensen, Overlap) to use latest CAPI - removes legacy cuda c/c++ code and python wrapper around legacy code - update CAPI tests - remove and update python tests Closes #2546 Closes #2547 Closes #2548 Closes #2549 Closes #2749 Authors: - Naim (https://github.com/naimnv) Approvers: - Seunghwa Kang (https://github.com/seunghwak) - Chuck Hastings (https://github.com/ChuckHastings) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3828 --- cpp/CMakeLists.txt | 2 - cpp/src/link_prediction/legacy/jaccard.cu | 429 ------------------ cpp/src/link_prediction/legacy/overlap.cu | 425 ----------------- cpp/tests/c_api/mg_similarity_test.c | 51 ++- cpp/tests/c_api/similarity_test.c | 57 +-- python/cugraph/CMakeLists.txt | 1 - .../cugraph/community/induced_subgraph.py | 9 +- .../cugraph/dask/link_prediction/jaccard.py | 2 +- .../cugraph/dask/link_prediction/overlap.py | 2 +- .../cugraph/dask/link_prediction/sorensen.py | 2 +- .../cugraph/cugraph/experimental/__init__.py | 32 +- .../experimental/link_prediction/__init__.py | 13 - .../experimental/link_prediction/jaccard.py | 255 ----------- .../experimental/link_prediction/overlap.py | 223 --------- .../experimental/link_prediction/sorensen.py | 221 --------- .../cugraph/link_prediction/CMakeLists.txt | 22 - .../cugraph/link_prediction/__init__.py | 23 +- .../cugraph/link_prediction/jaccard.pxd | 35 -- .../cugraph/link_prediction/jaccard.py | 208 ++++++--- .../link_prediction/jaccard_wrapper.pyx | 155 ------- .../cugraph/link_prediction/overlap.pxd | 35 -- .../cugraph/link_prediction/overlap.py | 212 +++++++-- .../link_prediction/overlap_wrapper.pyx | 142 ------ 
.../cugraph/link_prediction/sorensen.py | 223 ++++++--- .../cugraph/link_prediction/wjaccard.py | 100 ++-- .../cugraph/link_prediction/woverlap.py | 76 ++-- .../cugraph/link_prediction/wsorensen.py | 78 ++-- .../cugraph/cugraph/sampling/random_walks.py | 9 +- .../tests/link_prediction/test_jaccard.py | 315 +++++++------ .../tests/link_prediction/test_overlap.py | 152 ++++--- .../tests/link_prediction/test_sorensen.py | 252 ++++++---- .../tests/link_prediction/test_wjaccard.py | 177 -------- .../tests/link_prediction/test_woverlap.py | 171 ------- .../tests/link_prediction/test_wsorensen.py | 181 -------- python/pylibcugraph/pylibcugraph/__init__.py | 7 + .../pylibcugraph/experimental/__init__.py | 19 +- .../pylibcugraph/jaccard_coefficients.pyx | 12 +- .../pylibcugraph/overlap_coefficients.pyx | 10 +- .../pylibcugraph/sorensen_coefficients.pyx | 10 +- 39 files changed, 1129 insertions(+), 3219 deletions(-) delete mode 100644 cpp/src/link_prediction/legacy/jaccard.cu delete mode 100644 cpp/src/link_prediction/legacy/overlap.cu delete mode 100644 python/cugraph/cugraph/experimental/link_prediction/__init__.py delete mode 100644 python/cugraph/cugraph/experimental/link_prediction/jaccard.py delete mode 100644 python/cugraph/cugraph/experimental/link_prediction/overlap.py delete mode 100644 python/cugraph/cugraph/experimental/link_prediction/sorensen.py delete mode 100644 python/cugraph/cugraph/link_prediction/CMakeLists.txt delete mode 100644 python/cugraph/cugraph/link_prediction/jaccard.pxd delete mode 100644 python/cugraph/cugraph/link_prediction/jaccard_wrapper.pyx delete mode 100644 python/cugraph/cugraph/link_prediction/overlap.pxd delete mode 100644 python/cugraph/cugraph/link_prediction/overlap_wrapper.pyx delete mode 100644 python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py delete mode 100644 python/cugraph/cugraph/tests/link_prediction/test_woverlap.py delete mode 100644 python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py diff --git 
a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a6c26ee3b91..0d7bd86075d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -196,8 +196,6 @@ set(CUGRAPH_SOURCES src/utilities/path_retrieval.cu src/structure/legacy/graph.cu src/linear_assignment/legacy/hungarian.cu - src/link_prediction/legacy/jaccard.cu - src/link_prediction/legacy/overlap.cu src/link_prediction/jaccard_sg.cu src/link_prediction/sorensen_sg.cu src/link_prediction/overlap_sg.cu diff --git a/cpp/src/link_prediction/legacy/jaccard.cu b/cpp/src/link_prediction/legacy/jaccard.cu deleted file mode 100644 index d0b240e3c77..00000000000 --- a/cpp/src/link_prediction/legacy/jaccard.cu +++ /dev/null @@ -1,429 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include - -#include -#include - -#include - -namespace cugraph { -namespace detail { - -// Volume of neighboors (*weight_s) -template -__global__ void jaccard_row_sum( - vertex_t n, edge_t const* csrPtr, vertex_t const* csrInd, weight_t const* v, weight_t* work) -{ - vertex_t row; - edge_t start, end, length; - weight_t sum; - - for (row = threadIdx.y + blockIdx.y * blockDim.y; row < n; row += gridDim.y * blockDim.y) { - start = csrPtr[row]; - end = csrPtr[row + 1]; - length = end - start; - - // compute row sums - if (weighted) { - sum = parallel_prefix_sum(length, csrInd + start, v); - if (threadIdx.x == 0) work[row] = sum; - } else { - work[row] = static_cast(length); - } - } -} - -// Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) -template -__global__ void jaccard_is(vertex_t n, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t const* v, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s) -{ - edge_t i, j, Ni, Nj; - vertex_t row, col; - vertex_t ref, cur, ref_col, cur_col, match; - weight_t ref_val; - - for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) { - for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; j < csrPtr[row + 1]; - j += gridDim.y * blockDim.y) { - col = csrInd[j]; - // find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? 
col : row; - - // compute new sum weights - weight_s[j] = work[row] + work[col]; - - // compute new intersection weights - // search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } else { - ref_val = 1.0; - } - - // binary search (column indices are sorted within each row) - edge_t left = csrPtr[cur]; - edge_t right = csrPtr[cur + 1] - 1; - while (left <= right) { - edge_t middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } else if (cur_col < ref_col) { - left = middle + 1; - } else { - match = middle; - break; - } - } - - // if the element with the same column index in the reference row has been found - if (match != -1) { atomicAdd(&weight_i[j], ref_val); } - } - } - } -} - -// Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) -// Using list of node pairs -template -__global__ void jaccard_is_pairs(edge_t num_pairs, - edge_t const* csrPtr, - vertex_t const* csrInd, - vertex_t const* first_pair, - vertex_t const* second_pair, - weight_t const* v, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s) -{ - edge_t i, idx, Ni, Nj, match; - vertex_t row, col, ref, cur, ref_col, cur_col; - weight_t ref_val; - - for (idx = threadIdx.z + blockIdx.z * blockDim.z; idx < num_pairs; - idx += gridDim.z * blockDim.z) { - row = first_pair[idx]; - col = second_pair[idx]; - - // find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? 
col : row; - - // compute new sum weights - weight_s[idx] = work[row] + work[col]; - - // compute new intersection weights - // search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } else { - ref_val = 1.0; - } - - // binary search (column indices are sorted within each row) - edge_t left = csrPtr[cur]; - edge_t right = csrPtr[cur + 1] - 1; - while (left <= right) { - edge_t middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } else if (cur_col < ref_col) { - left = middle + 1; - } else { - match = middle; - break; - } - } - - // if the element with the same column index in the reference row has been found - if (match != -1) { atomicAdd(&weight_i[idx], ref_val); } - } - } -} - -// Jaccard weights (*weight) -template -__global__ void jaccard_jw(edge_t e, - weight_t const* weight_i, - weight_t const* weight_s, - weight_t* weight_j) -{ - edge_t j; - weight_t Wi, Ws, Wu; - - for (j = threadIdx.x + blockIdx.x * blockDim.x; j < e; j += gridDim.x * blockDim.x) { - Wi = weight_i[j]; - Ws = weight_s[j]; - Wu = Ws - Wi; - weight_j[j] = (Wi / Wu); - } -} - -template -int jaccard(vertex_t n, - edge_t e, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t const* weight_in, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - rmm::cuda_stream_view stream_view; - dim3 nthreads, nblocks; - int y = 4; - - // setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, vertex_t{CUDA_MAX_BLOCKS}); - nblocks.z = 1; - - // launch kernel - jaccard_row_sum - <<>>(n, csrPtr, csrInd, weight_in, work); - - thrust::fill(rmm::exec_policy(stream_view), weight_i, weight_i + e, weight_t{0.0}); 
- - // setup launch configuration - nthreads.x = 32 / y; - nthreads.y = y; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, vertex_t{CUDA_MAX_BLOCKS}); // 1; - - // launch kernel - jaccard_is<<>>( - n, csrPtr, csrInd, weight_in, work, weight_i, weight_s); - - // setup launch configuration - nthreads.x = min(e, edge_t{CUDA_MAX_KERNEL_THREADS}); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, edge_t{CUDA_MAX_BLOCKS}); - nblocks.y = 1; - nblocks.z = 1; - - // launch kernel - jaccard_jw - <<>>(e, weight_i, weight_s, weight_j); - - return 0; -} - -template -int jaccard_pairs(vertex_t n, - edge_t num_pairs, - edge_t const* csrPtr, - vertex_t const* csrInd, - vertex_t const* first_pair, - vertex_t const* second_pair, - weight_t const* weight_in, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - dim3 nthreads, nblocks; - int y = 4; - - // setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, vertex_t{CUDA_MAX_BLOCKS}); - nblocks.z = 1; - - // launch kernel - jaccard_row_sum - <<>>(n, csrPtr, csrInd, weight_in, work); - cudaDeviceSynchronize(); - - // NOTE: initilized weight_i vector with 0.0 - // fill(num_pairs, weight_i, weight_t{0.0}); - - // setup launch configuration - nthreads.x = 32; - nthreads.y = 1; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, vertex_t{CUDA_MAX_BLOCKS}); // 1; - - // launch kernel - jaccard_is_pairs<<>>( - num_pairs, csrPtr, csrInd, first_pair, second_pair, weight_in, work, weight_i, weight_s); - - // setup launch configuration - nthreads.x = min(num_pairs, edge_t{CUDA_MAX_KERNEL_THREADS}); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((num_pairs + nthreads.x - 1) / nthreads.x, (edge_t)CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - - // launch 
kernel - jaccard_jw - <<>>(num_pairs, weight_i, weight_s, weight_j); - - return 0; -} -} // namespace detail - -template -void jaccard(legacy::GraphCSRView const& graph, WT const* weights, WT* result) -{ - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); - - rmm::device_vector weight_i(graph.number_of_edges); - rmm::device_vector weight_s(graph.number_of_edges); - rmm::device_vector work(graph.number_of_vertices); - - if (weights == nullptr) { - cugraph::detail::jaccard(graph.number_of_vertices, - graph.number_of_edges, - graph.offsets, - graph.indices, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } else { - cugraph::detail::jaccard(graph.number_of_vertices, - graph.number_of_edges, - graph.offsets, - graph.indices, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } -} - -template -void jaccard_list(legacy::GraphCSRView const& graph, - WT const* weights, - ET num_pairs, - VT const* first, - VT const* second, - WT* result) -{ - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); - CUGRAPH_EXPECTS(first != nullptr, "Invalid input argument: first is NULL"); - CUGRAPH_EXPECTS(second != nullptr, "Invalid input argument: second in NULL"); - - rmm::device_vector weight_i(num_pairs, WT{0.0}); - rmm::device_vector weight_s(num_pairs); - rmm::device_vector work(graph.number_of_vertices); - - if (weights == nullptr) { - cugraph::detail::jaccard_pairs(graph.number_of_vertices, - num_pairs, - graph.offsets, - graph.indices, - first, - second, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } else { - cugraph::detail::jaccard_pairs(graph.number_of_vertices, - num_pairs, - graph.offsets, - graph.indices, - first, - second, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } -} - -template void jaccard(legacy::GraphCSRView 
const&, - float const*, - float*); -template void jaccard( - legacy::GraphCSRView const&, double const*, double*); -template void jaccard(legacy::GraphCSRView const&, - float const*, - float*); -template void jaccard( - legacy::GraphCSRView const&, double const*, double*); -template void jaccard_list( - legacy::GraphCSRView const&, - float const*, - int32_t, - int32_t const*, - int32_t const*, - float*); -template void jaccard_list( - legacy::GraphCSRView const&, - double const*, - int32_t, - int32_t const*, - int32_t const*, - double*); -template void jaccard_list( - legacy::GraphCSRView const&, - float const*, - int64_t, - int64_t const*, - int64_t const*, - float*); -template void jaccard_list( - legacy::GraphCSRView const&, - double const*, - int64_t, - int64_t const*, - int64_t const*, - double*); - -} // namespace cugraph diff --git a/cpp/src/link_prediction/legacy/overlap.cu b/cpp/src/link_prediction/legacy/overlap.cu deleted file mode 100644 index 67d7cd5e4c6..00000000000 --- a/cpp/src/link_prediction/legacy/overlap.cu +++ /dev/null @@ -1,425 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -namespace cugraph { -namespace detail { - -// Volume of neighboors (*weight_s) -// TODO: Identical kernel to jaccard_row_sum!! 
-template -__global__ void overlap_row_sum( - vertex_t n, edge_t const* csrPtr, vertex_t const* csrInd, weight_t const* v, weight_t* work) -{ - vertex_t row; - edge_t start, end, length; - weight_t sum; - - for (row = threadIdx.y + blockIdx.y * blockDim.y; row < n; row += gridDim.y * blockDim.y) { - start = csrPtr[row]; - end = csrPtr[row + 1]; - length = end - start; - - // compute row sums - if (weighted) { - sum = parallel_prefix_sum(length, csrInd + start, v); - if (threadIdx.x == 0) work[row] = sum; - } else { - work[row] = static_cast(length); - } - } -} - -// Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) -// TODO: Identical kernel to jaccard_row_sum!! -template -__global__ void overlap_is(vertex_t n, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t const* v, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s) -{ - edge_t i, j, Ni, Nj; - vertex_t row, col; - vertex_t ref, cur, ref_col, cur_col, match; - weight_t ref_val; - - for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) { - for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; j < csrPtr[row + 1]; - j += gridDim.y * blockDim.y) { - col = csrInd[j]; - // find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? 
col : row; - - // compute new sum weights - weight_s[j] = min(work[row], work[col]); - - // compute new intersection weights - // search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } else { - ref_val = 1.0; - } - - // binary search (column indices are sorted within each row) - edge_t left = csrPtr[cur]; - edge_t right = csrPtr[cur + 1] - 1; - while (left <= right) { - edge_t middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } else if (cur_col < ref_col) { - left = middle + 1; - } else { - match = middle; - break; - } - } - - // if the element with the same column index in the reference row has been found - if (match != -1) { atomicAdd(&weight_i[j], ref_val); } - } - } - } -} - -// Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) -// Using list of node pairs -// NOTE: NOT the same as jaccard -template -__global__ void overlap_is_pairs(edge_t num_pairs, - edge_t const* csrPtr, - vertex_t const* csrInd, - vertex_t const* first_pair, - vertex_t const* second_pair, - weight_t const* v, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s) -{ - edge_t i, idx, Ni, Nj, match; - vertex_t row, col, ref, cur, ref_col, cur_col; - weight_t ref_val; - - for (idx = threadIdx.z + blockIdx.z * blockDim.z; idx < num_pairs; - idx += gridDim.z * blockDim.z) { - row = first_pair[idx]; - col = second_pair[idx]; - - // find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? 
col : row; - - // compute new sum weights - weight_s[idx] = min(work[row], work[col]); - - // compute new intersection weights - // search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } else { - ref_val = 1.0; - } - - // binary search (column indices are sorted within each row) - edge_t left = csrPtr[cur]; - edge_t right = csrPtr[cur + 1] - 1; - while (left <= right) { - edge_t middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } else if (cur_col < ref_col) { - left = middle + 1; - } else { - match = middle; - break; - } - } - - // if the element with the same column index in the reference row has been found - if (match != -1) { atomicAdd(&weight_i[idx], ref_val); } - } - } -} - -// Overlap weights (*weight) -template -__global__ void overlap_jw(edge_t e, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - edge_t j; - weight_t Wi, Wu; - - for (j = threadIdx.x + blockIdx.x * blockDim.x; j < e; j += gridDim.x * blockDim.x) { - Wi = weight_i[j]; - Wu = weight_s[j]; - weight_j[j] = (Wi / Wu); - } -} - -template -int overlap(vertex_t n, - edge_t e, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t const* weight_in, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - dim3 nthreads, nblocks; - int y = 4; - - // setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, vertex_t{CUDA_MAX_BLOCKS}); - nblocks.z = 1; - - // launch kernel - overlap_row_sum - <<>>(n, csrPtr, csrInd, weight_in, work); - cudaDeviceSynchronize(); - fill(e, weight_i, weight_t{0.0}); - - // setup launch configuration - 
nthreads.x = 32 / y; - nthreads.y = y; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, vertex_t{CUDA_MAX_BLOCKS}); // 1; - - // launch kernel - overlap_is - <<>>(n, csrPtr, csrInd, weight_in, work, weight_i, weight_s); - - // setup launch configuration - nthreads.x = min(e, edge_t{CUDA_MAX_KERNEL_THREADS}); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, edge_t{CUDA_MAX_BLOCKS}); - nblocks.y = 1; - nblocks.z = 1; - - // launch kernel - overlap_jw - <<>>(e, csrPtr, csrInd, weight_i, weight_s, weight_j); - - return 0; -} - -template -int overlap_pairs(vertex_t n, - edge_t num_pairs, - edge_t const* csrPtr, - vertex_t const* csrInd, - vertex_t const* first_pair, - vertex_t const* second_pair, - weight_t const* weight_in, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - dim3 nthreads, nblocks; - int y = 4; - - // setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, vertex_t{CUDA_MAX_BLOCKS}); - nblocks.z = 1; - // launch kernel - - overlap_row_sum - <<>>(n, csrPtr, csrInd, weight_in, work); - cudaDeviceSynchronize(); - fill(num_pairs, weight_i, weight_t{0.0}); - // setup launch configuration - nthreads.x = 32; - nthreads.y = 1; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, vertex_t{CUDA_MAX_BLOCKS}); // 1; - - // launch kernel - overlap_is_pairs<<>>( - num_pairs, csrPtr, csrInd, first_pair, second_pair, weight_in, work, weight_i, weight_s); - - // setup launch configuration - nthreads.x = min(num_pairs, edge_t{CUDA_MAX_KERNEL_THREADS}); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((num_pairs + nthreads.x - 1) / nthreads.x, edge_t{CUDA_MAX_BLOCKS}); - nblocks.y = 1; - nblocks.z = 1; - // launch kernel - - overlap_jw - <<>>(num_pairs, csrPtr, csrInd, weight_i, weight_s, 
weight_j); - - return 0; -} -} // namespace detail - -template -void overlap(legacy::GraphCSRView const& graph, WT const* weights, WT* result) -{ - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); - - rmm::device_vector weight_i(graph.number_of_edges); - rmm::device_vector weight_s(graph.number_of_edges); - rmm::device_vector work(graph.number_of_vertices); - - if (weights == nullptr) { - cugraph::detail::overlap(graph.number_of_vertices, - graph.number_of_edges, - graph.offsets, - graph.indices, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } else { - cugraph::detail::overlap(graph.number_of_vertices, - graph.number_of_edges, - graph.offsets, - graph.indices, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } -} - -template -void overlap_list(legacy::GraphCSRView const& graph, - WT const* weights, - ET num_pairs, - VT const* first, - VT const* second, - WT* result) -{ - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); - CUGRAPH_EXPECTS(first != nullptr, "Invalid input argument: first column is NULL"); - CUGRAPH_EXPECTS(second != nullptr, "Invalid input argument: second column is NULL"); - - rmm::device_vector weight_i(num_pairs); - rmm::device_vector weight_s(num_pairs); - rmm::device_vector work(graph.number_of_vertices); - - if (weights == nullptr) { - cugraph::detail::overlap_pairs(graph.number_of_vertices, - num_pairs, - graph.offsets, - graph.indices, - first, - second, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } else { - cugraph::detail::overlap_pairs(graph.number_of_vertices, - num_pairs, - graph.offsets, - graph.indices, - first, - second, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } -} - -template void overlap(legacy::GraphCSRView const&, - float const*, - float*); -template void 
overlap( - legacy::GraphCSRView const&, double const*, double*); -template void overlap(legacy::GraphCSRView const&, - float const*, - float*); -template void overlap( - legacy::GraphCSRView const&, double const*, double*); -template void overlap_list( - legacy::GraphCSRView const&, - float const*, - int32_t, - int32_t const*, - int32_t const*, - float*); -template void overlap_list( - legacy::GraphCSRView const&, - double const*, - int32_t, - int32_t const*, - int32_t const*, - double*); -template void overlap_list( - legacy::GraphCSRView const&, - float const*, - int64_t, - int64_t const*, - int64_t const*, - float*); -template void overlap_list( - legacy::GraphCSRView const&, - double const*, - int64_t, - int64_t const*, - int64_t const*, - double*); - -} // namespace cugraph diff --git a/cpp/tests/c_api/mg_similarity_test.c b/cpp/tests/c_api/mg_similarity_test.c index 0ac160245ab..336f6c50519 100644 --- a/cpp/tests/c_api/mg_similarity_test.c +++ b/cpp/tests/c_api/mg_similarity_test.c @@ -160,15 +160,16 @@ int test_jaccard(const cugraph_resource_handle_t* handle) int test_weighted_jaccard(const cugraph_resource_handle_t* handle) { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + 
weight_t h_result[] = {0.357143, 0.208333, 0.0}; return generic_similarity_test(handle, h_src, @@ -216,15 +217,16 @@ int test_sorensen(const cugraph_resource_handle_t* handle) int test_weighted_sorensen(const cugraph_resource_handle_t* handle) { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.526316, 0.344828, 0.000000}; return generic_similarity_test(handle, h_src, @@ -272,15 +274,16 @@ int test_overlap(const cugraph_resource_handle_t* handle) int test_weighted_overlap(const cugraph_resource_handle_t* handle) { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 
2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.714286, 0.416667, 0.000000}; return generic_similarity_test(handle, h_src, diff --git a/cpp/tests/c_api/similarity_test.c b/cpp/tests/c_api/similarity_test.c index 20af3f3eccd..52f849ccd28 100644 --- a/cpp/tests/c_api/similarity_test.c +++ b/cpp/tests/c_api/similarity_test.c @@ -161,15 +161,16 @@ int test_jaccard() int test_weighted_jaccard() { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.357143, 0.208333, 0.0}; return generic_similarity_test(h_src, h_dst, @@ -215,15 +216,16 @@ int test_sorensen() int test_weighted_sorensen() { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 
4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.526316, 0.344828, 0.000000}; return generic_similarity_test(h_src, h_dst, @@ -269,15 +271,16 @@ int test_overlap() int test_weighted_overlap() { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.714286, 0.416667, 0.000000}; return generic_similarity_test(h_src, h_dst, @@ -301,8 +304,8 @@ int main(int argc, char** argv) result |= RUN_TEST(test_jaccard); result |= RUN_TEST(test_sorensen); result |= RUN_TEST(test_overlap); - // result |= RUN_TEST(test_weighted_jaccard); - // result |= RUN_TEST(test_weighted_sorensen); - // result |= RUN_TEST(test_weighted_overlap); + result |= RUN_TEST(test_weighted_jaccard); + result |= RUN_TEST(test_weighted_sorensen); + result |= RUN_TEST(test_weighted_overlap); return result; } diff --git 
a/python/cugraph/CMakeLists.txt b/python/cugraph/CMakeLists.txt index f3b28623b12..ecfcb9b219f 100644 --- a/python/cugraph/CMakeLists.txt +++ b/python/cugraph/CMakeLists.txt @@ -89,7 +89,6 @@ add_subdirectory(cugraph/dask/structure) add_subdirectory(cugraph/internals) add_subdirectory(cugraph/layout) add_subdirectory(cugraph/linear_assignment) -add_subdirectory(cugraph/link_prediction) add_subdirectory(cugraph/structure) add_subdirectory(cugraph/tree) add_subdirectory(cugraph/utilities) diff --git a/python/cugraph/cugraph/community/induced_subgraph.py b/python/cugraph/cugraph/community/induced_subgraph.py index 29fe2f29c1e..3a901199b01 100644 --- a/python/cugraph/cugraph/community/induced_subgraph.py +++ b/python/cugraph/cugraph/community/induced_subgraph.py @@ -25,11 +25,10 @@ ) from cugraph.utilities.utils import import_optional -# FIXME: the networkx.Graph type used in the type annotation for -# induced_subgraph() is specified using a string literal to avoid depending on -# and importing networkx. Instead, networkx is imported optionally, which may -# cause a problem for a type checker if run in an environment where networkx is -# not installed. +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. 
networkx = import_optional("networkx") diff --git a/python/cugraph/cugraph/dask/link_prediction/jaccard.py b/python/cugraph/cugraph/dask/link_prediction/jaccard.py index b3d688584a0..218e6206fc3 100644 --- a/python/cugraph/cugraph/dask/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/dask/link_prediction/jaccard.py @@ -20,7 +20,7 @@ from cugraph.dask.common.input_utils import get_distributed_data from cugraph.utilities import renumber_vertex_pair -from pylibcugraph.experimental import ( +from pylibcugraph import ( jaccard_coefficients as pylibcugraph_jaccard_coefficients, ) from pylibcugraph import ResourceHandle diff --git a/python/cugraph/cugraph/dask/link_prediction/overlap.py b/python/cugraph/cugraph/dask/link_prediction/overlap.py index c47aeef3c72..5540be28fd1 100644 --- a/python/cugraph/cugraph/dask/link_prediction/overlap.py +++ b/python/cugraph/cugraph/dask/link_prediction/overlap.py @@ -20,7 +20,7 @@ from cugraph.dask.common.input_utils import get_distributed_data from cugraph.utilities import renumber_vertex_pair -from pylibcugraph.experimental import ( +from pylibcugraph import ( overlap_coefficients as pylibcugraph_overlap_coefficients, ) from pylibcugraph import ResourceHandle diff --git a/python/cugraph/cugraph/dask/link_prediction/sorensen.py b/python/cugraph/cugraph/dask/link_prediction/sorensen.py index bb5a3f44f39..24295ac330c 100644 --- a/python/cugraph/cugraph/dask/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/dask/link_prediction/sorensen.py @@ -20,7 +20,7 @@ from cugraph.dask.common.input_utils import get_distributed_data from cugraph.utilities import renumber_vertex_pair -from pylibcugraph.experimental import ( +from pylibcugraph import ( sorensen_coefficients as pylibcugraph_sorensen_coefficients, ) from pylibcugraph import ResourceHandle diff --git a/python/cugraph/cugraph/experimental/__init__.py b/python/cugraph/cugraph/experimental/__init__.py index b96b760e634..2309a529047 100644 --- 
a/python/cugraph/cugraph/experimental/__init__.py +++ b/python/cugraph/cugraph/experimental/__init__.py @@ -48,30 +48,22 @@ experimental_warning_wrapper(EXPERIMENTAL__find_bicliques) ) -from cugraph.experimental.link_prediction.jaccard import ( - EXPERIMENTAL__jaccard, - EXPERIMENTAL__jaccard_coefficient, -) +from cugraph.gnn.data_loading import EXPERIMENTAL__BulkSampler -jaccard = experimental_warning_wrapper(EXPERIMENTAL__jaccard) -jaccard_coefficient = experimental_warning_wrapper(EXPERIMENTAL__jaccard_coefficient) +BulkSampler = experimental_warning_wrapper(EXPERIMENTAL__BulkSampler) -from cugraph.experimental.link_prediction.sorensen import ( - EXPERIMENTAL__sorensen, - EXPERIMENTAL__sorensen_coefficient, -) -sorensen = experimental_warning_wrapper(EXPERIMENTAL__sorensen) -sorensen_coefficient = experimental_warning_wrapper(EXPERIMENTAL__sorensen_coefficient) +from cugraph.link_prediction.jaccard import jaccard, jaccard_coefficient -from cugraph.experimental.link_prediction.overlap import ( - EXPERIMENTAL__overlap, - EXPERIMENTAL__overlap_coefficient, -) +jaccard = promoted_experimental_warning_wrapper(jaccard) +jaccard_coefficient = promoted_experimental_warning_wrapper(jaccard_coefficient) -overlap = experimental_warning_wrapper(EXPERIMENTAL__overlap) -overlap_coefficient = experimental_warning_wrapper(EXPERIMENTAL__overlap_coefficient) +from cugraph.link_prediction.sorensen import sorensen, sorensen_coefficient -from cugraph.gnn.data_loading import EXPERIMENTAL__BulkSampler +sorensen = promoted_experimental_warning_wrapper(sorensen) +sorensen_coefficient = promoted_experimental_warning_wrapper(sorensen_coefficient) -BulkSampler = experimental_warning_wrapper(EXPERIMENTAL__BulkSampler) +from cugraph.link_prediction.overlap import overlap, overlap_coefficient + +overlap = promoted_experimental_warning_wrapper(overlap) +overlap_coefficient = promoted_experimental_warning_wrapper(overlap_coefficient) diff --git 
a/python/cugraph/cugraph/experimental/link_prediction/__init__.py b/python/cugraph/cugraph/experimental/link_prediction/__init__.py deleted file mode 100644 index 081b2ae8260..00000000000 --- a/python/cugraph/cugraph/experimental/link_prediction/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/python/cugraph/cugraph/experimental/link_prediction/jaccard.py b/python/cugraph/cugraph/experimental/link_prediction/jaccard.py deleted file mode 100644 index 2eba73b3824..00000000000 --- a/python/cugraph/cugraph/experimental/link_prediction/jaccard.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from cugraph.utilities import ( - ensure_cugraph_obj_for_nx, - df_edge_score_to_dictionary, - renumber_vertex_pair, -) -import cudf -import warnings - -from pylibcugraph.experimental import ( - jaccard_coefficients as pylibcugraph_jaccard_coefficients, -) -from pylibcugraph import ResourceHandle - - -# FIXME: Move this function to the utility module so that it can be -# shared by other algos -def ensure_valid_dtype(input_graph, vertex_pair): - - vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] - vertex_pair_dtypes = vertex_pair.dtypes - - if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: - warning_msg = ( - "Jaccard requires 'vertex_pair' to match the graph's 'vertex' type. " - f"input graph's vertex type is: {vertex_dtype} and got " - f"'vertex_pair' of type: {vertex_pair_dtypes}." - ) - warnings.warn(warning_msg, UserWarning) - vertex_pair = vertex_pair.astype(vertex_dtype) - - return vertex_pair - - -def EXPERIMENTAL__jaccard(G, vertex_pair=None, use_weight=False): - """ - Compute the Jaccard similarity between each pair of vertices connected by - an edge, or between arbitrary pairs of vertices specified by the user. - Jaccard similarity is defined between two sets as the ratio of the volume - of their intersection divided by the volume of their union. In the context - of graphs, the neighborhood of a vertex is seen as a set. The Jaccard - similarity weight of each edge represents the strength of connection - between vertices based on the relative similarity of their neighbors. If - first is specified but second is not, or vice versa, an exception will be - thrown. - - NOTE: If the vertex_pair parameter is not specified then the behavior - of cugraph.jaccard is different from the behavior of - networkx.jaccard_coefficient. 
- - cugraph.jaccard, in the absence of a specified vertex pair list, will - compute the two_hop_neighbors of the entire graph to construct a vertex pair - list and will return the jaccard coefficient for those vertex pairs. This is - not advisable as the vertex_pairs can grow exponentially with respect to the - size of the datasets - - networkx.jaccard_coefficient, in the absence of a specified vertex - pair list, will return an upper triangular dense matrix, excluding - the diagonal as well as vertex pairs that are directly connected - by an edge in the graph, of jaccard coefficients. Technically, networkx - returns a lazy iterator across this upper triangular matrix where - the actual jaccard coefficient is computed when the iterator is - dereferenced. Computing a dense matrix of results is not feasible - if the number of vertices in the graph is large (100,000 vertices - would result in 4.9 billion values in that iterator). - - If your graph is small enough (or you have enough memory and patience) - you can get the interesting (non-zero) values that are part of the networkx - solution by doing the following: - - >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> pairs = G.get_two_hop_neighbors() - >>> df = cugraph.jaccard(G, pairs) - - But please remember that cugraph will fill the dataframe with the entire - solution you request, so you'll need enough memory to store the 2-hop - neighborhood dataframe. - - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - - This implementation only supports undirected, unweighted Graph. 
- - vertex_pair : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the jaccard coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the jaccard coefficient for all - adjacent vertices in the graph. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Jaccard weights. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - df['second'] : cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - df['jaccard_coeff'] : cudf.Series - The computed jaccard coefficient between the first and the second - vertex ID. 
- - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import jaccard as exp_jaccard - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_jaccard(G) - - """ - if G.is_directed(): - raise ValueError("Input must be an undirected Graph.") - - if G.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if vertex_pair is None: - # Call two_hop neighbor of the entire graph - vertex_pair = G.get_two_hop_neighbors() - - v_p_num_col = len(vertex_pair.columns) - - if isinstance(vertex_pair, cudf.DataFrame): - vertex_pair = renumber_vertex_pair(G, vertex_pair) - vertex_pair = ensure_valid_dtype(G, vertex_pair) - src_col_name = vertex_pair.columns[0] - dst_col_name = vertex_pair.columns[1] - first = vertex_pair[src_col_name] - second = vertex_pair[dst_col_name] - - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - use_weight = False - first, second, jaccard_coeff = pylibcugraph_jaccard_coefficients( - resource_handle=ResourceHandle(), - graph=G._plc_graph, - first=first, - second=second, - use_weight=use_weight, - do_expensive_check=False, - ) - - if G.renumbered: - vertex_pair = G.unrenumber(vertex_pair, src_col_name, preserve_order=True) - vertex_pair = G.unrenumber(vertex_pair, dst_col_name, preserve_order=True) - - if v_p_num_col == 2: - # single column vertex - vertex_pair = vertex_pair.rename( - columns={src_col_name: "first", dst_col_name: "second"} - ) - - df = vertex_pair - df["jaccard_coeff"] = cudf.Series(jaccard_coeff) - - return df - - -def EXPERIMENTAL__jaccard_coefficient(G, ebunch=None, use_weight=False): - """ - For NetworkX Compatability. 
See `jaccard` - - Parameters - ---------- - graph : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - - ebunch : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the jaccard coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the jaccard coefficient for all - adjacent vertices in the graph. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Jaccard weights. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - ddf['first']: dask_cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - ddf['second']: dask_cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - ddf['jaccard_coeff']: dask_cudf.Series - The computed jaccard coefficient between the first and the second - vertex ID. - - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import jaccard_coefficient as exp_jaccard_coefficient - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_jaccard_coefficient(G) - - """ - vertex_pair = None - - G, isNx = ensure_cugraph_obj_for_nx(G) - - # FIXME: What is the logic behind this since the docstrings mention that 'G' and - # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? 
- if isNx is True and ebunch is not None: - vertex_pair = cudf.DataFrame(ebunch) - - df = EXPERIMENTAL__jaccard(G, vertex_pair) - - if isNx is True: - df = df_edge_score_to_dictionary( - df, k="jaccard_coeff", src="first", dst="second" - ) - - return df diff --git a/python/cugraph/cugraph/experimental/link_prediction/overlap.py b/python/cugraph/cugraph/experimental/link_prediction/overlap.py deleted file mode 100644 index 0981ced4835..00000000000 --- a/python/cugraph/cugraph/experimental/link_prediction/overlap.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from cugraph.utilities import ( - ensure_cugraph_obj_for_nx, - df_edge_score_to_dictionary, - renumber_vertex_pair, -) -import cudf -import warnings - -from pylibcugraph.experimental import ( - overlap_coefficients as pylibcugraph_overlap_coefficients, -) -from pylibcugraph import ResourceHandle - - -# FIXME: Move this function to the utility module so that it can be -# shared by other algos -def ensure_valid_dtype(input_graph, vertex_pair): - - vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] - vertex_pair_dtypes = vertex_pair.dtypes - - if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: - warning_msg = ( - "Overlap requires 'vertex_pair' to match the graph's 'vertex' type. " - f"input graph's vertex type is: {vertex_dtype} and got " - f"'vertex_pair' of type: {vertex_pair_dtypes}." 
- ) - warnings.warn(warning_msg, UserWarning) - vertex_pair = vertex_pair.astype(vertex_dtype) - - return vertex_pair - - -def EXPERIMENTAL__overlap_coefficient(G, ebunch=None, use_weight=False): - """ - For NetworkX Compatability. See `overlap` - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - - ebunch : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the Overlap coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the overlap coefficient for all - adjacent vertices in the graph. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the overlap weights. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - ddf['first']: dask_cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - ddf['second']: dask_cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - ddf['overlap_coeff']: dask_cudf.Series - The computed overlap coefficient between the first and the second - vertex ID. 
- - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import overlap_coefficient as exp_overlap_coefficient - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_overlap_coefficient(G) - """ - vertex_pair = None - - G, isNx = ensure_cugraph_obj_for_nx(G) - - # FIXME: What is the logic behind this since the docstrings mention that 'G' and - # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? - if isNx is True and ebunch is not None: - vertex_pair = cudf.DataFrame(ebunch) - - df = EXPERIMENTAL__overlap(G, vertex_pair) - - if isNx is True: - df = df_edge_score_to_dictionary( - df, k="overlap_coeff", src="first", dst="second" - ) - - return df - - -def EXPERIMENTAL__overlap(G, vertex_pair=None, use_weight=False): - """ - Compute the Overlap Coefficient between each pair of vertices connected by - an edge, or between arbitrary pairs of vertices specified by the user. - Overlap Coefficient is defined between two sets as the ratio of the volume - of their intersection divided by the smaller of their two volumes. In the - context of graphs, the neighborhood of a vertex is seen as a set. The - Overlap Coefficient weight of each edge represents the strength of - connection between vertices based on the relative similarity of their - neighbors. If first is specified but second is not, or vice versa, an - exception will be thrown. - - cugraph.overlap, in the absence of a specified vertex pair list, will - compute the two_hop_neighbors of the entire graph to construct a vertex pair - list and will return the overlap coefficient for those vertex pairs. This is - not advisable as the vertex_pairs can grow exponentially with respect to the - size of the datasets - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). 
The - adjacency list will be computed if not already present. - - This implementation only supports undirected, unweighted Graph. - - vertex_pair : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the overlap coefficient is computed for the - given vertex pairs, else, it is computed for all vertex pairs. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Overlap coefficients. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - df['second'] : cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - df['overlap_coeff'] : cudf.Series - The computed overlap coefficient between the first and the second - vertex ID. 
- - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import overlap as exp_overlap - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_overlap(G) - - """ - - if G.is_directed(): - raise ValueError("Input must be an undirected Graph.") - - if G.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if vertex_pair is None: - # Call two_hop neighbor of the entire graph - vertex_pair = G.get_two_hop_neighbors() - - v_p_num_col = len(vertex_pair.columns) - - if isinstance(vertex_pair, cudf.DataFrame): - vertex_pair = renumber_vertex_pair(G, vertex_pair) - vertex_pair = ensure_valid_dtype(G, vertex_pair) - src_col_name = vertex_pair.columns[0] - dst_col_name = vertex_pair.columns[1] - first = vertex_pair[src_col_name] - second = vertex_pair[dst_col_name] - - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - use_weight = False - first, second, overlap_coeff = pylibcugraph_overlap_coefficients( - resource_handle=ResourceHandle(), - graph=G._plc_graph, - first=first, - second=second, - use_weight=use_weight, - do_expensive_check=False, - ) - - if G.renumbered: - vertex_pair = G.unrenumber(vertex_pair, src_col_name, preserve_order=True) - vertex_pair = G.unrenumber(vertex_pair, dst_col_name, preserve_order=True) - - if v_p_num_col == 2: - # single column vertex - vertex_pair = vertex_pair.rename( - columns={src_col_name: "first", dst_col_name: "second"} - ) - - df = vertex_pair - df["overlap_coeff"] = cudf.Series(overlap_coeff) - - return df diff --git a/python/cugraph/cugraph/experimental/link_prediction/sorensen.py b/python/cugraph/cugraph/experimental/link_prediction/sorensen.py deleted file mode 100644 index ed27e4813d3..00000000000 --- a/python/cugraph/cugraph/experimental/link_prediction/sorensen.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright 
(c) 2021-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from cugraph.utilities import ( - ensure_cugraph_obj_for_nx, - df_edge_score_to_dictionary, - renumber_vertex_pair, -) -import cudf -import warnings -from pylibcugraph.experimental import ( - sorensen_coefficients as pylibcugraph_sorensen_coefficients, -) -from pylibcugraph import ResourceHandle - - -# FIXME: Move this function to the utility module so that it can be -# shared by other algos -def ensure_valid_dtype(input_graph, vertex_pair): - - vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] - vertex_pair_dtypes = vertex_pair.dtypes - - if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: - warning_msg = ( - "Sorensen requires 'vertex_pair' to match the graph's 'vertex' type. " - f"input graph's vertex type is: {vertex_dtype} and got " - f"'vertex_pair' of type: {vertex_pair_dtypes}." - ) - warnings.warn(warning_msg, UserWarning) - vertex_pair = vertex_pair.astype(vertex_dtype) - - return vertex_pair - - -def EXPERIMENTAL__sorensen(G, vertex_pair=None, use_weight=False): - """ - Compute the Sorensen coefficient between each pair of vertices connected by - an edge, or between arbitrary pairs of vertices specified by the user. - Sorensen coefficient is defined between two sets as the ratio of twice the - volume of their intersection divided by the volume of each set. 
- If first is specified but second is not, or vice versa, an exception will - be thrown. - - cugraph.sorensen, in the absence of a specified vertex pair list, will - compute the two_hop_neighbors of the entire graph to construct a vertex pair - list and will return the sorensen coefficient for those vertex pairs. This is - not advisable as the vertex_pairs can grow exponentially with respect to the - size of the datasets - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - - This implementation only supports undirected, unweighted Graph. - - vertex_pair : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the Sorensen coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the Sorensen coefficient for all - adjacent vertices in the graph. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Sorensen index. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - df['second'] : cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - df['sorensen_coeff'] : cudf.Series - The computed sorensen coefficient between the first and the second - vertex ID. 
- - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import sorensen as exp_sorensen - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_sorensen(G) - - """ - if G.is_directed(): - raise ValueError("Input must be an undirected Graph.") - - if G.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if vertex_pair is None: - # Call two_hop neighbor of the entire graph - vertex_pair = G.get_two_hop_neighbors() - - v_p_num_col = len(vertex_pair.columns) - - if isinstance(vertex_pair, cudf.DataFrame): - vertex_pair = renumber_vertex_pair(G, vertex_pair) - vertex_pair = ensure_valid_dtype(G, vertex_pair) - src_col_name = vertex_pair.columns[0] - dst_col_name = vertex_pair.columns[1] - first = vertex_pair[src_col_name] - second = vertex_pair[dst_col_name] - - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - use_weight = False - first, second, sorensen_coeff = pylibcugraph_sorensen_coefficients( - resource_handle=ResourceHandle(), - graph=G._plc_graph, - first=first, - second=second, - use_weight=use_weight, - do_expensive_check=False, - ) - - if G.renumbered: - vertex_pair = G.unrenumber(vertex_pair, src_col_name, preserve_order=True) - vertex_pair = G.unrenumber(vertex_pair, dst_col_name, preserve_order=True) - - if v_p_num_col == 2: - # single column vertex - vertex_pair = vertex_pair.rename( - columns={src_col_name: "first", dst_col_name: "second"} - ) - - df = vertex_pair - df["sorensen_coeff"] = cudf.Series(sorensen_coeff) - - return df - - -def EXPERIMENTAL__sorensen_coefficient(G, ebunch=None, use_weight=False): - """ - For NetworkX Compatability. 
See `sorensen` - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - ebunch : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the sorensen coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the sorensen coefficient for all - adjacent vertices in the graph. - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Sorensen weights. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - df['second'] : cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - df['sorensen_coeff'] : cudf.Series - The computed sorensen coefficient between the first and the second - vertex ID. - - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import sorensen_coefficient as exp_sorensen_coef - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_sorensen_coef(G) - - """ - vertex_pair = None - - G, isNx = ensure_cugraph_obj_for_nx(G) - - # FIXME: What is the logic behind this since the docstrings mention that 'G' and - # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? 
- if isNx is True and ebunch is not None: - vertex_pair = cudf.DataFrame(ebunch) - - df = EXPERIMENTAL__sorensen(G, vertex_pair) - - if isNx is True: - df = df_edge_score_to_dictionary( - df, k="sorensen_coeff", src="first", dst="second" - ) - - return df diff --git a/python/cugraph/cugraph/link_prediction/CMakeLists.txt b/python/cugraph/cugraph/link_prediction/CMakeLists.txt deleted file mode 100644 index a117cf9afc3..00000000000 --- a/python/cugraph/cugraph/link_prediction/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(cython_sources jaccard_wrapper.pyx overlap_wrapper.pyx) -set(linked_libraries cugraph::cugraph) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX link_prediction_ - ASSOCIATED_TARGETS cugraph -) diff --git a/python/cugraph/cugraph/link_prediction/__init__.py b/python/cugraph/cugraph/link_prediction/__init__.py index a6911d3b8ae..a8517ee7c0f 100644 --- a/python/cugraph/cugraph/link_prediction/__init__.py +++ b/python/cugraph/cugraph/link_prediction/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. 
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,13 +11,26 @@ # See the License for the specific language governing permissions and # limitations under the License. + +from cugraph.utilities.api_tools import deprecated_warning_wrapper from cugraph.link_prediction.jaccard import jaccard from cugraph.link_prediction.jaccard import jaccard_coefficient + +from cugraph.link_prediction.sorensen import sorensen +from cugraph.link_prediction.sorensen import sorensen_coefficient + from cugraph.link_prediction.overlap import overlap +from cugraph.link_prediction.overlap import overlap_coefficient + +# To be deprecated from cugraph.link_prediction.wjaccard import jaccard_w + +jaccard_w = deprecated_warning_wrapper(jaccard_w) + from cugraph.link_prediction.woverlap import overlap_w + +overlap_w = deprecated_warning_wrapper(overlap_w) + from cugraph.link_prediction.wsorensen import sorensen_w -from cugraph.link_prediction.jaccard import jaccard_coefficient -from cugraph.link_prediction.sorensen import sorensen_coefficient -from cugraph.link_prediction.sorensen import sorensen -from cugraph.link_prediction.overlap import overlap_coefficient + +sorensen_w = deprecated_warning_wrapper(sorensen_w) diff --git a/python/cugraph/cugraph/link_prediction/jaccard.pxd b/python/cugraph/cugraph/link_prediction/jaccard.pxd deleted file mode 100644 index 9e8c82ec3d8..00000000000 --- a/python/cugraph/cugraph/link_prediction/jaccard.pxd +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.structure.graph_primtypes cimport * - - -cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": - - cdef void jaccard[VT,ET,WT]( - const GraphCSRView[VT,ET,WT] &graph, - const WT *weights, - WT *result) except + - - cdef void jaccard_list[VT,ET,WT]( - const GraphCSRView[VT,ET,WT] &graph, - const WT *weights, - ET num_pairs, - const VT *first, - const VT *second, - WT *result) except + diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py index 334d57f9d80..27bfa58e6b0 100644 --- a/python/cugraph/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/link_prediction/jaccard.py @@ -11,16 +11,54 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cudf -from cugraph.link_prediction import jaccard_wrapper from cugraph.utilities import ( ensure_cugraph_obj_for_nx, df_edge_score_to_dictionary, renumber_vertex_pair, ) +import cudf +import warnings +from typing import Union, Iterable + +from pylibcugraph import ( + jaccard_coefficients as pylibcugraph_jaccard_coefficients, +) +from pylibcugraph import ResourceHandle + +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. 
+# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + + +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] + vertex_pair_dtypes = vertex_pair.dtypes + + if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: + warning_msg = ( + "Jaccard requires 'vertex_pair' to match the graph's 'vertex' type. " + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair -def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): +def jaccard( + input_graph: Graph, + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated + use_weight: bool = False, +): """ Compute the Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -36,13 +74,11 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): of cugraph.jaccard is different from the behavior of networkx.jaccard_coefficient. - This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. - cugraph.jaccard, in the absence of a specified vertex pair list, will - use the edges of the graph to construct a vertex pair list and will - return the jaccard coefficient for those vertex pairs. + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the jaccard coefficient for those vertex pairs. 
This is + not advisable as the vertex_pairs can grow exponentially with respect to the + size of the datasets networkx.jaccard_coefficient, in the absence of a specified vertex pair list, will return an upper triangular dense matrix, excluding @@ -59,9 +95,9 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): solution by doing the following: >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> pairs = G.get_two_hop_neighbors() - >>> df = cugraph.jaccard(G, pairs) + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> pairs = input_graph.get_two_hop_neighbors() + >>> df = cugraph.jaccard(input_graph, pairs) But please remember that cugraph will fill the dataframe with the entire solution you request, so you'll need enough memory to store the 2-hop @@ -72,10 +108,11 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): ---------- input_graph : cugraph.Graph cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. + as an edge list. The graph should be undirected where an undirected + edge is represented by a directed edge in both direction. The adjacency + list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of @@ -84,9 +121,20 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): current implementation computes the jaccard coefficient for all adjacent vertices in the graph.
- do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted jaccard (if use_weight==True) + or un-weighted jaccard (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + Returns ------- @@ -99,7 +147,7 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): df['first'] : cudf.Series The first vertex ID of each pair (will be identical to first if specified). df['second'] : cudf.Series - the second vertex ID of each pair (will be identical to second if + The second vertex ID of each pair (will be identical to second if specified). 
df['jaccard_coeff'] : cudf.Series The computed Jaccard coefficient between the first and the second @@ -108,65 +156,101 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): Examples -------- >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> df = cugraph.jaccard(G) + >>> from cugraph import jaccard + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = jaccard(input_graph) """ if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) if input_graph.is_directed(): raise ValueError("Input must be an undirected Graph.") - if type(vertex_pair) == cudf.DataFrame: + + if vertex_pair is None: + # Call two_hop neighbor of the entire graph + vertex_pair = input_graph.get_two_hop_neighbors() + + v_p_num_col = len(vertex_pair.columns) + + if isinstance(vertex_pair, cudf.DataFrame): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + vertex_pair = ensure_valid_dtype(input_graph, vertex_pair) + src_col_name = vertex_pair.columns[0] + dst_col_name = vertex_pair.columns[1] + first = vertex_pair[src_col_name] + second = vertex_pair[dst_col_name] + elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") + raise ValueError("vertex_pair must be a cudf Dataframe") - df = jaccard_wrapper.jaccard(input_graph, None, vertex_pair) + first, second, jaccard_coeff = pylibcugraph_jaccard_coefficients( + 
resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + first=first, + second=second, + use_weight=use_weight, + do_expensive_check=False, + ) if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") + vertex_pair = input_graph.unrenumber( + vertex_pair, src_col_name, preserve_order=True + ) + vertex_pair = input_graph.unrenumber( + vertex_pair, dst_col_name, preserve_order=True + ) + + if v_p_num_col == 2: + # single column vertex + vertex_pair = vertex_pair.rename( + columns={src_col_name: "first", dst_col_name: "second"} + ) + + df = vertex_pair + df["jaccard_coeff"] = cudf.Series(jaccard_coeff) return df -def jaccard_coefficient(G, ebunch=None, do_expensive_check=True): +def jaccard_coefficient( + G: Union[Graph, "networkx.Graph"], + ebunch: Union[cudf.DataFrame, Iterable[Union[int, str, float]]] = None, + do_expensive_check: bool = False, # deprecated +): """ For NetworkX Compatability. See `jaccard` - NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. - Parameters ---------- - graph : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. + G : cugraph.Graph or NetworkX.Graph + cuGraph or NetworkX Graph instance, should contain the connectivity + information as an edge list. The graph should be undirected where an + undirected edge is represented by a directed edge in both direction. + The adjacency list will be computed if not already present. - ebunch : cudf.DataFrame, optional (default=None) + This implementation only supports undirected, non-multi Graphs. 
+ + ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None) A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the jaccard coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the jaccard coefficient for all - adjacent vertices in the graph. + vertices or iterable of 2-tuples (u, v) where u and v are nodes in + the graph. + + If provided, the Jaccard coefficient is computed for the given vertex + pairs. Otherwise, the current implementation computes the jaccard + coefficient for all adjacent vertices in the graph. + + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. Returns ------- @@ -188,10 +272,18 @@ def jaccard_coefficient(G, ebunch=None, do_expensive_check=True): Examples -------- >>> from cugraph.datasets import karate + >>> from cugraph import jaccard_coefficient >>> G = karate.get_graph(download=True) - >>> df = cugraph.jaccard_coefficient(G) + >>> df = jaccard_coefficient(G) """ + if do_expensive_check: + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) + vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) diff --git a/python/cugraph/cugraph/link_prediction/jaccard_wrapper.pyx b/python/cugraph/cugraph/link_prediction/jaccard_wrapper.pyx deleted file mode 100644 index e66d8bf0b5c..00000000000 --- a/python/cugraph/cugraph/link_prediction/jaccard_wrapper.pyx +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.link_prediction.jaccard cimport jaccard as c_jaccard -from cugraph.link_prediction.jaccard cimport jaccard_list as c_jaccard_list -from cugraph.structure.graph_primtypes cimport * -from cugraph.structure import graph_primtypes_wrapper -from libc.stdint cimport uintptr_t -import cudf -import numpy as np - - -def jaccard(input_graph, weights_arr=None, vertex_pair=None): - """ - Call jaccard or jaccard_list - """ - offsets = None - indices = None - - if input_graph.adjlist: - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, - input_graph.adjlist.indices], [np.int32]) - elif input_graph.transposedadjlist: - # - # NOTE: jaccard ONLY operates on an undirected graph, so CSR and CSC should be - # equivalent. The undirected check has already happened, so we'll just use - # the CSC as if it were CSR. 
- # - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.transposedadjlist.offsets, - input_graph.transposedadjlist.indices], [np.int32]) - else: - input_graph.view_adj_list() - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, - input_graph.adjlist.indices], [np.int32]) - - num_verts = input_graph.number_of_vertices() - num_edges = input_graph.number_of_edges(directed_edges=True) - - first = None - second = None - - cdef uintptr_t c_result_col = NULL - cdef uintptr_t c_first_col = NULL - cdef uintptr_t c_second_col = NULL - cdef uintptr_t c_src_index_col = NULL - cdef uintptr_t c_dst_index_col = NULL - cdef uintptr_t c_weights = NULL - cdef uintptr_t c_offsets = offsets.__cuda_array_interface__['data'][0] - cdef uintptr_t c_indices = indices.__cuda_array_interface__['data'][0] - - cdef GraphCSRView[int,int,float] graph_float - cdef GraphCSRView[int,int,double] graph_double - - weight_type = np.float32 - - if weights_arr is not None: - [weights] = graph_primtypes_wrapper.datatype_cast([weights_arr], [np.float32, np.float64]) - c_weights = weights.__cuda_array_interface__['data'][0] - weight_type = weights.dtype - - if type(vertex_pair) == cudf.DataFrame: - result_size = len(vertex_pair) - result = cudf.Series(np.ones(result_size, dtype=weight_type)) - c_result_col = result.__cuda_array_interface__['data'][0] - - df = cudf.DataFrame() - df['jaccard_coeff'] = result - - cols = vertex_pair.columns.to_list() - first = vertex_pair[cols[0]].astype(np.int32) - second = vertex_pair[cols[1]].astype(np.int32) - - # FIXME: multi column support - df['first'] = first - df['second'] = second - c_first_col = first.__cuda_array_interface__['data'][0] - c_second_col = second.__cuda_array_interface__['data'][0] - - if weight_type == np.float32: - graph_float = GraphCSRView[int,int,float](c_offsets, c_indices, - c_weights, num_verts, num_edges) - c_jaccard_list[int,int,float](graph_float, - c_weights, - result_size, - 
c_first_col, - c_second_col, - c_result_col) - else: - graph_double = GraphCSRView[int,int,double](c_offsets, c_indices, - c_weights, num_verts, num_edges) - c_jaccard_list[int,int,double](graph_double, - c_weights, - result_size, - c_first_col, - c_second_col, - c_result_col) - - return df - else: - # error check performed in jaccard.py - assert vertex_pair is None - - df = cudf.DataFrame() - df['first'] = cudf.Series(np.zeros(num_edges, indices.dtype)) - df['second'] = indices - - c_src_index_col = df['first'].__cuda_array_interface__['data'][0] - - if weight_type == np.float32: - df['jaccard_coeff'] = cudf.Series(np.ones(num_edges, dtype=np.float32), - nan_as_null=False) - c_result_col = df['jaccard_coeff'].__cuda_array_interface__['data'][0] - - graph_float = GraphCSRView[int,int,float](c_offsets, - c_indices, - c_weights, - num_verts, - num_edges) - c_jaccard[int,int,float](graph_float, - c_weights, - c_result_col) - - graph_float.get_source_indices(c_src_index_col) - else: - df['jaccard_coeff'] = cudf.Series(np.ones(num_edges, dtype=np.float64), - nan_as_null=False) - c_result_col = df['jaccard_coeff'].__cuda_array_interface__['data'][0] - - graph_double = GraphCSRView[int,int,double](c_offsets, - c_indices, - c_weights, - num_verts, - num_edges) - c_jaccard[int,int,double](graph_double, - c_weights, - c_result_col) - - graph_double.get_source_indices(c_src_index_col) - - return df diff --git a/python/cugraph/cugraph/link_prediction/overlap.pxd b/python/cugraph/cugraph/link_prediction/overlap.pxd deleted file mode 100644 index f0654472587..00000000000 --- a/python/cugraph/cugraph/link_prediction/overlap.pxd +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.structure.graph_primtypes cimport * - - -cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": - - cdef void overlap[VT,ET,WT]( - const GraphCSRView[VT,ET,WT] &graph, - const WT *weights, - WT *result) except + - - cdef void overlap_list[VT,ET,WT]( - const GraphCSRView[VT,ET,WT] &graph, - const WT *weights, - ET num_pairs, - const VT *first, - const VT *second, - WT *result) except + diff --git a/python/cugraph/cugraph/link_prediction/overlap.py b/python/cugraph/cugraph/link_prediction/overlap.py index 9bb7b76b0ca..3a25526679c 100644 --- a/python/cugraph/cugraph/link_prediction/overlap.py +++ b/python/cugraph/cugraph/link_prediction/overlap.py @@ -11,28 +11,120 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.link_prediction import overlap_wrapper -import cudf from cugraph.utilities import ( ensure_cugraph_obj_for_nx, df_edge_score_to_dictionary, renumber_vertex_pair, ) +import cudf +import warnings +from typing import Union, Iterable + +from pylibcugraph import ( + overlap_coefficients as pylibcugraph_overlap_coefficients, +) +from pylibcugraph import ResourceHandle + +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. 
+# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] + vertex_pair_dtypes = vertex_pair.dtypes -def overlap_coefficient(G, ebunch=None, do_expensive_check=True): + if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: + warning_msg = ( + "Overlap requires 'vertex_pair' to match the graph's 'vertex' type. " + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair + + +def overlap_coefficient( + G: Union[Graph, "networkx.Graph"], + ebunch: Union[cudf.DataFrame, Iterable[Union[int, str, float]]] = None, + do_expensive_check: bool = False, # deprecated +): """ - For NetworkX Compatability. See `overlap` + Compute overlap coefficient. + + Parameters + ---------- + G : cugraph.Graph or NetworkX.Graph + cuGraph or NetworkX Graph instance, should contain the connectivity + information as an edge list. The graph should be undirected where an + undirected edge is represented by a directed edge in both direction. + The adjacency list will be computed if not already present. + + This implementation only supports undirected, non-multi edge Graph. + + ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None) + A GPU dataframe consisting of two columns representing pairs of + vertices or iterable of 2-tuples (u, v) where u and v are nodes in + the graph. + + If provided, the Overlap coefficient is computed for the given vertex + pairs. 
Otherwise, the current implementation computes the overlap + coefficient for all adjacent vertices in the graph. + + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the overlap weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. - NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + df['overlap_coeff'] : cudf.Series + The computed overlap coefficient between the first and the second + vertex ID. + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import overlap_coefficient + >>> G = karate.get_graph(download=True, ignore_weights=True) + >>> df = overlap_coefficient(G) """ + if do_expensive_check: + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) + vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) + # FIXME: What is the logic behind this since the docstrings mention that 'G' and + # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame?
if isNx is True and ebunch is not None: vertex_pair = cudf.DataFrame(ebunch) @@ -46,7 +138,12 @@ def overlap_coefficient(G, ebunch=None, do_expensive_check=True): return df -def overlap(input_graph, vertex_pair=None, do_expensive_check=True): +def overlap( + input_graph: Graph, + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated + use_weight: bool = False, +): """ Compute the Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -58,25 +155,39 @@ def overlap(input_graph, vertex_pair=None, do_expensive_check=True): neighbors. If first is specified but second is not, or vice versa, an exception will be thrown. - NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. + cugraph.overlap, in the absence of a specified vertex pair list, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the overlap coefficient for those vertex pairs. This is + not advisable as the vertex_pairs can grow exponentially with respect to the + size of the datasets Parameters ---------- input_graph : cugraph.Graph cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The - adjacency list will be computed if not already present. + as an edge list. The adjacency list will be computed if not already + present. + This implementation only supports undirected, non-multi edge Graph. vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of vertices. If provided, the overlap coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. 
- do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted overlap (if use_weight==True) + or un-weighted overlap (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + Returns ------- @@ -98,35 +209,62 @@ def overlap(input_graph, vertex_pair=None, do_expensive_check=True): Examples -------- >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> df = cugraph.overlap(G) + >>> from cugraph import overlap + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = overlap(input_graph) """ if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(vertex_pair) == cudf.DataFrame: + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) + + if input_graph.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertex_pair is None: + # Call two_hop neighbor of the entire graph + 
vertex_pair = input_graph.get_two_hop_neighbors() + + v_p_num_col = len(vertex_pair.columns) + + if isinstance(vertex_pair, cudf.DataFrame): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + vertex_pair = ensure_valid_dtype(input_graph, vertex_pair) + src_col_name = vertex_pair.columns[0] + dst_col_name = vertex_pair.columns[1] + first = vertex_pair[src_col_name] + second = vertex_pair[dst_col_name] + elif vertex_pair is not None: raise ValueError("vertex_pair must be a cudf dataframe") - df = overlap_wrapper.overlap(input_graph, None, vertex_pair) + first, second, overlap_coeff = pylibcugraph_overlap_coefficients( + resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + first=first, + second=second, + use_weight=use_weight, + do_expensive_check=False, + ) if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") + vertex_pair = input_graph.unrenumber( + vertex_pair, src_col_name, preserve_order=True + ) + vertex_pair = input_graph.unrenumber( + vertex_pair, dst_col_name, preserve_order=True + ) + + if v_p_num_col == 2: + # single column vertex + vertex_pair = vertex_pair.rename( + columns={src_col_name: "first", dst_col_name: "second"} + ) + + df = vertex_pair + df["overlap_coeff"] = cudf.Series(overlap_coeff) return df diff --git a/python/cugraph/cugraph/link_prediction/overlap_wrapper.pyx b/python/cugraph/cugraph/link_prediction/overlap_wrapper.pyx deleted file mode 100644 index 0f61460a72f..00000000000 --- a/python/cugraph/cugraph/link_prediction/overlap_wrapper.pyx +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.link_prediction.overlap cimport overlap as c_overlap -from cugraph.link_prediction.overlap cimport overlap_list as c_overlap_list -from cugraph.structure.graph_primtypes cimport * -from cugraph.structure import graph_primtypes_wrapper -from libc.stdint cimport uintptr_t -import cudf -import numpy as np - - -def overlap(input_graph, weights_arr=None, vertex_pair=None): - """ - Call overlap or overlap_list - """ - - if not input_graph.adjlist: - input_graph.view_adj_list() - - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) - - num_verts = input_graph.number_of_vertices() - num_edges = input_graph.number_of_edges(directed_edges=True) - - first = None - second = None - - cdef uintptr_t c_result_col = NULL - cdef uintptr_t c_first_col = NULL - cdef uintptr_t c_second_col = NULL - cdef uintptr_t c_src_index_col = NULL - cdef uintptr_t c_dst_index_col = NULL - cdef uintptr_t c_weights = NULL - cdef uintptr_t c_offsets = offsets.__cuda_array_interface__['data'][0] - cdef uintptr_t c_indices = indices.__cuda_array_interface__['data'][0] - - cdef GraphCSRView[int,int,float] graph_float - cdef GraphCSRView[int,int,double] graph_double - - weight_type = np.float32 - - if weights_arr is not None: - [weights] = graph_primtypes_wrapper.datatype_cast([weights_arr], [np.float32, np.float64]) - c_weights = weights.__cuda_array_interface__['data'][0] - 
weight_type = weights.dtype - - if type(vertex_pair) == cudf.DataFrame: - result_size = len(vertex_pair) - result = cudf.Series(np.ones(result_size, dtype=np.float32)) - c_result_col = result.__cuda_array_interface__['data'][0] - - df = cudf.DataFrame() - df['overlap_coeff'] = result - - cols = vertex_pair.columns.to_list() - first = vertex_pair[cols[0]] - second = vertex_pair[cols[1]] - - # FIXME: multi column support - df['first'] = first - df['second'] = second - c_first_col = first.__cuda_array_interface__['data'][0] - c_second_col = second.__cuda_array_interface__['data'][0] - - if weight_type == np.float32: - graph_float = GraphCSRView[int,int,float](c_offsets, c_indices, - c_weights, num_verts, num_edges) - c_overlap_list[int,int,float](graph_float, - c_weights, - result_size, - c_first_col, - c_second_col, - c_result_col) - else: - graph_double = GraphCSRView[int,int,double](c_offsets, c_indices, - c_weights, num_verts, num_edges) - c_overlap_list[int,int,double](graph_double, - c_weights, - result_size, - c_first_col, - c_second_col, - c_result_col) - - return df - else: - # error check performed in overlap.py - assert vertex_pair is None - - df = cudf.DataFrame() - df['first'] = cudf.Series(np.zeros(num_edges, indices.dtype)) - df['second'] = indices - - c_src_index_col = df['first'].__cuda_array_interface__['data'][0] - - if weight_type == np.float32: - df['overlap_coeff'] = cudf.Series(np.ones(num_edges, dtype=np.float32), - nan_as_null=False) - c_result_col = df['overlap_coeff'].__cuda_array_interface__['data'][0] - - graph_float = GraphCSRView[int,int,float](c_offsets, - c_indices, - c_weights, - num_verts, - num_edges) - c_overlap[int,int,float](graph_float, - c_weights, - c_result_col) - - graph_float.get_source_indices(c_src_index_col) - else: - df['overlap_coeff'] = cudf.Series(np.ones(num_edges, dtype=np.float64), - nan_as_null=False) - c_result_col = df['overlap_coeff'].__cuda_array_interface__['data'][0] - - graph_double = 
GraphCSRView[int,int,double](c_offsets, - c_indices, - c_weights, - num_verts, - num_edges) - c_overlap[int,int,double](graph_double, - c_weights, - c_result_col) - - graph_double.get_source_indices(c_src_index_col) - - return df diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py index ef2bd8d674d..a8ccced1e68 100644 --- a/python/cugraph/cugraph/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/link_prediction/sorensen.py @@ -11,17 +11,54 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cudf -from cugraph.structure.graph_classes import Graph -from cugraph.link_prediction import jaccard_wrapper from cugraph.utilities import ( ensure_cugraph_obj_for_nx, df_edge_score_to_dictionary, renumber_vertex_pair, ) +import cudf +import warnings +from typing import Union, Iterable + +from pylibcugraph import ( + sorensen_coefficients as pylibcugraph_sorensen_coefficients, +) +from pylibcugraph import ResourceHandle + +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + + +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] + vertex_pair_dtypes = vertex_pair.dtypes + + if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: + warning_msg = ( + "Sorensen requires 'vertex_pair' to match the graph's 'vertex' type. 
" + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair -def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): +def sorensen( + input_graph: Graph, + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated + use_weight: bool = False, +): """ Compute the Sorensen coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -30,22 +67,20 @@ def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): If first is specified but second is not, or vice versa, an exception will be thrown. - NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. - cugraph.sorensen, in the absence of a specified vertex pair list, will - use the edges of the graph to construct a vertex pair list and will - return the sorensen coefficient for those vertex pairs. + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the sorensen coefficient for those vertex pairs. This is + not advisable as the vertex_pairs can grow exponentially with respect to the + size of the datasets Parameters ---------- input_graph : cugraph.Graph cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. + as an edge list. The adjacency list will be computed if not already + present. + + This implementation only supports undirected, non-multi edge Graph. 
vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of @@ -54,9 +89,18 @@ def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): current implementation computes the Sorensen coefficient for all adjacent vertices in the graph. - do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted sorensen (if use_weight==True) + or un-weighted sorensen (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. Returns ------- @@ -67,79 +111,112 @@ def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): pairs. df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified) - + The first vertex ID of each pair (will be identical to first if specified). df['second'] : cudf.Series The second vertex ID of each pair (will be identical to second if - specified) - + specified). df['sorensen_coeff'] : cudf.Series - The computed Sorensen coefficient between the first and the second + The computed sorensen coefficient between the first and the second vertex ID. 
Examples -------- >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> df = cugraph.sorensen(G) + >>> from cugraph import sorensen + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = sorensen(input_graph) """ if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(input_graph) is not Graph: - raise TypeError("input graph must a Graph") - - if type(vertex_pair) == cudf.DataFrame: + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) + + if input_graph.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertex_pair is None: + # Call two_hop neighbor of the entire graph + vertex_pair = input_graph.get_two_hop_neighbors() + + v_p_num_col = len(vertex_pair.columns) + + if isinstance(vertex_pair, cudf.DataFrame): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + vertex_pair = ensure_valid_dtype(input_graph, vertex_pair) + src_col_name = vertex_pair.columns[0] + dst_col_name = vertex_pair.columns[1] + first = vertex_pair[src_col_name] + second = vertex_pair[dst_col_name] + elif vertex_pair is not None: raise ValueError("vertex_pair must be a cudf dataframe") - df = jaccard_wrapper.jaccard(input_graph, None, vertex_pair) - df.jaccard_coeff = (2 * df.jaccard_coeff) / (1 + df.jaccard_coeff) - df.rename({"jaccard_coeff": "sorensen_coeff"}, axis=1, inplace=True) + first, second, sorensen_coeff = pylibcugraph_sorensen_coefficients( + 
resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + first=first, + second=second, + use_weight=use_weight, + do_expensive_check=False, + ) + if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") + vertex_pair = input_graph.unrenumber( + vertex_pair, src_col_name, preserve_order=True + ) + vertex_pair = input_graph.unrenumber( + vertex_pair, dst_col_name, preserve_order=True + ) + + if v_p_num_col == 2: + # single column vertex + vertex_pair = vertex_pair.rename( + columns={src_col_name: "first", dst_col_name: "second"} + ) + + df = vertex_pair + df["sorensen_coeff"] = cudf.Series(sorensen_coeff) return df -def sorensen_coefficient(G, ebunch=None, do_expensive_check=True): +def sorensen_coefficient( + G: Union[Graph, "networkx.Graph"], + ebunch: Union[cudf.DataFrame, Iterable[Union[int, str, float]]] = None, + do_expensive_check: bool = False, # deprecated +): """ - For NetworkX Compatability. See `sorensen` - - NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. + Compute sorensen coefficient. Parameters ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - ebunch : cudf.DataFrame, optional (default=None) + G : cugraph.Graph or NetworkX.Graph + cuGraph or NetworkX Graph instance, should contain the connectivity + information as an edge list. The graph should be undirected where an + undirected edge is represented by a directed edge in both direction. + The adjacency list will be computed if not already present. 
+ + This implementation only supports undirected, non-multi Graphs. + + ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None) A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the sorensen coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the sorensen coefficient for all - adjacent vertices in the graph. + vertices or iterable of 2-tuples (u, v) where u and v are nodes in + the graph. + + If provided, the Sorensen coefficient is computed for the given vertex + pairs. Otherwise, the current implementation computes the sorensen + coefficient for all adjacent vertices in the graph. + + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. Returns ------- @@ -152,7 +229,7 @@ def sorensen_coefficient(G, ebunch=None, do_expensive_check=True): df['first'] : cudf.Series The first vertex ID of each pair (will be identical to first if specified). df['second'] : cudf.Series - the second vertex ID of each pair (will be identical to second if + The second vertex ID of each pair (will be identical to second if specified). 
df['sorensen_coeff'] : cudf.Series The computed Sorensen coefficient between the first and the second @@ -161,14 +238,24 @@ def sorensen_coefficient(G, ebunch=None, do_expensive_check=True): Examples -------- >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> df = cugraph.sorensen_coefficient(G) + >>> from cugraph import sorensen_coefficient + >>> G = karate.get_graph(download=True, ignore_weights=True) + >>> df = sorensen_coefficient(G) """ + if do_expensive_check: + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) + vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) + # FIXME: What is the logic behind this since the docstrings mention that 'G' and + # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? if isNx is True and ebunch is not None: vertex_pair = cudf.DataFrame(ebunch) diff --git a/python/cugraph/cugraph/link_prediction/wjaccard.py b/python/cugraph/cugraph/link_prediction/wjaccard.py index e3486473fe5..ec538bbc0ed 100644 --- a/python/cugraph/cugraph/link_prediction/wjaccard.py +++ b/python/cugraph/cugraph/link_prediction/wjaccard.py @@ -11,13 +11,45 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.structure.graph_classes import Graph -from cugraph.link_prediction import jaccard_wrapper +from cugraph.link_prediction import jaccard import cudf -from cugraph.utilities import renumber_vertex_pair +import warnings +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional -def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. 
+# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + + +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] + vertex_pair_dtypes = vertex_pair.dtypes + + if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: + warning_msg = ( + "Jaccard requires 'vertex_pair' to match the graph's 'vertex' type. " + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair + + +def jaccard_w( + input_graph: Graph, + weights: cudf.DataFrame = None, # deprecated + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated +): """ Compute the weighted Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by @@ -55,9 +87,13 @@ def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): vertices. If provided, the jaccard coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. - do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. 
Returns ------- @@ -95,47 +131,9 @@ def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): >>> df = cugraph.jaccard_w(G, weights) """ - if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(input_graph) is not Graph: - raise TypeError("input graph must a Graph") - - if type(vertex_pair) == cudf.DataFrame: - vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - if input_graph.renumbered: - # The 'vertex' column of the cudf 'weights' also needs to be renumbered - # if the graph was renumbered - vertex_size = input_graph.vertex_column_size() - # single-column vertices i.e only one src and dst columns - if vertex_size == 1: - weights = input_graph.add_internal_vertex_id(weights, "vertex", "vertex") - # multi-column vertices i.e more than one src and dst columns - else: - cols = weights.columns[:vertex_size].to_list() - weights = input_graph.add_internal_vertex_id(weights, "vertex", cols) - - jaccard_weights = weights["weight"] - df = jaccard_wrapper.jaccard(input_graph, jaccard_weights, vertex_pair) - - if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") - - return df + warning_msg = ( + "jaccard_w is deprecated. 
To compute weighted jaccard, please use " + "jaccard(input_graph, vertex_pair=False, use_weight=True)" + ) + warnings.warn(warning_msg, FutureWarning) + return jaccard(input_graph, vertex_pair, do_expensive_check, use_weight=True) diff --git a/python/cugraph/cugraph/link_prediction/woverlap.py b/python/cugraph/cugraph/link_prediction/woverlap.py index d7ebc5fc684..5f43ad0670b 100644 --- a/python/cugraph/cugraph/link_prediction/woverlap.py +++ b/python/cugraph/cugraph/link_prediction/woverlap.py @@ -11,12 +11,26 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.link_prediction import overlap_wrapper +from cugraph.link_prediction import overlap import cudf -from cugraph.utilities import renumber_vertex_pair +import warnings +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional -def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + + +def overlap_w( + input_graph: Graph, + weights: cudf.DataFrame = None, # deprecated + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated +): """ Compute the weighted Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by @@ -55,9 +69,13 @@ def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): vertices. If provided, the overlap coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. 
- do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. Returns ------- @@ -96,43 +114,9 @@ def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): ... len(weights['vertex']))] >>> df = cugraph.overlap_w(G, weights) """ - if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(vertex_pair) == cudf.DataFrame: - vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - if input_graph.renumbered: - vertex_size = input_graph.vertex_column_size() - if vertex_size == 1: - weights = input_graph.add_internal_vertex_id(weights, "vertex", "vertex") - else: - cols = weights.columns[:vertex_size].to_list() - weights = input_graph.add_internal_vertex_id(weights, "vertex", cols) - - overlap_weights = weights["weight"] - - overlap_weights = overlap_weights.astype("float32") - - df = overlap_wrapper.overlap(input_graph, overlap_weights, vertex_pair) - - if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, 
"second") - - return df + warning_msg = ( + " overlap_w is deprecated. To compute weighted overlap, please use " + "overlap(input_graph, vertex_pair=False, use_weight=True)" + ) + warnings.warn(warning_msg, FutureWarning) + return overlap(input_graph, vertex_pair, do_expensive_check, use_weight=True) diff --git a/python/cugraph/cugraph/link_prediction/wsorensen.py b/python/cugraph/cugraph/link_prediction/wsorensen.py index 8337b4602de..ff502b36837 100644 --- a/python/cugraph/cugraph/link_prediction/wsorensen.py +++ b/python/cugraph/cugraph/link_prediction/wsorensen.py @@ -11,13 +11,26 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.structure.graph_classes import Graph -from cugraph.link_prediction import jaccard_wrapper +from cugraph.link_prediction import sorensen import cudf -from cugraph.utilities import renumber_vertex_pair +import warnings +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional -def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + + +def sorensen_w( + input_graph: Graph, + weights: cudf.DataFrame = None, # deprecated + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated +): """ Compute the weighted Sorensen similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by @@ -51,9 +64,13 @@ def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): vertices. If provided, the sorensen coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. 
- do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. Returns ------- @@ -93,44 +110,9 @@ def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): >>> df = cugraph.sorensen_w(G, weights) """ - if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(input_graph) is not Graph: - raise TypeError("input graph must a Graph") - - if type(vertex_pair) == cudf.DataFrame: - vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - if input_graph.renumbered: - vertex_size = input_graph.vertex_column_size() - if vertex_size == 1: - weights = input_graph.add_internal_vertex_id(weights, "vertex", "vertex") - else: - cols = weights.columns[:vertex_size].to_list() - weights = input_graph.add_internal_vertex_id(weights, "vertex", cols) - jaccard_weights = weights["weight"] - df = jaccard_wrapper.jaccard(input_graph, jaccard_weights, vertex_pair) - df.jaccard_coeff = (2 * df.jaccard_coeff) / (1 + df.jaccard_coeff) - df.rename({"jaccard_coeff": 
"sorensen_coeff"}, axis=1, inplace=True) - - if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") - - return df + warning_msg = ( + "sorensen_w is deprecated. To compute weighted sorensen, please use " + "sorensen(input_graph, vertex_pair=False, use_weight=True)" + ) + warnings.warn(warning_msg, FutureWarning) + return sorensen(input_graph, vertex_pair, use_weight=True) diff --git a/python/cugraph/cugraph/sampling/random_walks.py b/python/cugraph/cugraph/sampling/random_walks.py index 015c05d1b08..7b04dba82a5 100644 --- a/python/cugraph/cugraph/sampling/random_walks.py +++ b/python/cugraph/cugraph/sampling/random_walks.py @@ -25,11 +25,10 @@ from cugraph.utilities.utils import import_optional from typing import Union, Tuple -# FIXME: the networkx.Graph type used in the type annotation for -# induced_subgraph() is specified using a string literal to avoid depending on -# and importing networkx. Instead, networkx is imported optionally, which may -# cause a problem for a type checker if run in an environment where networkx is -# not installed. +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. networkx = import_optional("networkx") diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index cd883fb88f2..7ce7d263eda 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -11,6 +11,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# FIXME: Can we use global variables for column names instead of hardcoded ones? 
+ import gc import pytest @@ -20,12 +22,19 @@ import cugraph from cugraph.datasets import netscience from cugraph.testing import utils, UNDIRECTED_DATASETS -from cugraph.experimental import jaccard as exp_jaccard -from cudf.testing import assert_series_equal, assert_frame_equal -from cugraph.experimental import jaccard_coefficient as exp_jaccard_coefficient - +from cudf.testing import assert_series_equal +from cudf.testing.testing import assert_frame_equal -print("Networkx version : {} ".format(nx.__version__)) +SRC_COL = "0" +DST_COL = "1" +VERTEX_PAIR_FIRST_COL = "first" +VERTEX_PAIR_SECOND_COL = "second" +JACCARD_COEFF_COL = "jaccard_coeff" +EDGE_ATT_COL = "weight" +MULTI_COL_SRC_0_COL = "src_0" +MULTI_COL_DST_0_COL = "dst_0" +MULTI_COL_SRC_1_COL = "src_1" +MULTI_COL_DST_1_COL = "dst_1" # ============================================================================= @@ -38,65 +47,79 @@ def setup_function(): # ============================================================================= # Helper functions # ============================================================================= -def compare_jaccard_two_hop(G, Gnx, edgevals=True): + + +def compare_jaccard_two_hop(G, Gnx, use_weight=False): """ Compute both cugraph and nx jaccard after extracting the two hop neighbors from G and compare both results """ pairs = ( G.get_two_hop_neighbors() - .sort_values(["first", "second"]) + .sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]) .reset_index(drop=True) ) - nx_pairs = list(pairs.to_records(index=False)) - preds = nx.jaccard_coefficient(Gnx, nx_pairs) - nx_coeff = [] - for u, v, p in preds: - # print(u, " ", v, " ", p) - nx_coeff.append(p) df = cugraph.jaccard(G, pairs) - df = df.sort_values(by=["first", "second"]).reset_index(drop=True) - if not edgevals: - # experimental jaccard currently only supports unweighted graphs - df_exp = exp_jaccard(G, pairs) - df_exp = df_exp.sort_values(by=["first", "second"]).reset_index(drop=True) - assert_frame_equal(df, 
df_exp, check_dtype=False, check_like=True) + df = df.sort_values(by=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) - assert len(nx_coeff) == len(df) - for i in range(len(df)): - diff = abs(nx_coeff[i] - df["jaccard_coeff"].iloc[i]) - assert diff < 1.0e-6 + if not use_weight: + nx_pairs = list(pairs.to_records(index=False)) + preds = nx.jaccard_coefficient(Gnx, nx_pairs) + nx_coeff = [] + for u, v, p in preds: + nx_coeff.append(p) + + assert len(nx_coeff) == len(df) + for i in range(len(df)): + diff = abs(nx_coeff[i] - df[JACCARD_COEFF_COL].iloc[i]) + assert diff < 1.0e-6 + else: + # FIXME: compare results against resultset api + pass -def cugraph_call(benchmark_callable, graph_file, edgevals=False, input_df=None): +def cugraph_call(benchmark_callable, graph_file, input_df=None, use_weight=False): G = cugraph.Graph() - G = graph_file.get_graph(ignore_weights=not edgevals) + G = graph_file.get_graph(ignore_weights=not use_weight) # If no vertex_pair is passed as input, 'cugraph.jaccard' will # compute the 'jaccard_similarity' with the two_hop_neighbor of the # entire graph while nx compute with the one_hop_neighbor. 
For better # comparaison, get the one_hop_neighbor of the entire graph for 'cugraph.jaccard' # and pass it as vertex_pair - vertex_pair = input_df.rename(columns={"0": "first", "1": "second"}) - vertex_pair = vertex_pair[["first", "second"]] + if isinstance(input_df, cudf.DataFrame): + vertex_pair = input_df.rename( + columns={SRC_COL: VERTEX_PAIR_FIRST_COL, DST_COL: VERTEX_PAIR_SECOND_COL} + ) + vertex_pair = vertex_pair[[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]] + else: + vertex_pair = cudf.DataFrame( + columns=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL], + dtype=G.edgelist.edgelist_df["src"].dtype, + ) # cugraph Jaccard Call - df = benchmark_callable(cugraph.jaccard, G, vertex_pair=vertex_pair) + df = benchmark_callable( + cugraph.jaccard, G, vertex_pair=vertex_pair, use_weight=use_weight + ) - df = df.sort_values(["first", "second"]).reset_index(drop=True) + df = df.sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) return ( - df["first"].to_numpy(), - df["second"].to_numpy(), - df["jaccard_coeff"].to_numpy(), + df[VERTEX_PAIR_FIRST_COL].to_numpy(), + df[VERTEX_PAIR_SECOND_COL].to_numpy(), + df[JACCARD_COEFF_COL].to_numpy(), ) def networkx_call(M, benchmark_callable=None): - - sources = M["0"] - destinations = M["1"] + sources = M[SRC_COL] + destinations = M[DST_COL] edges = [] for i in range(len(M)): edges.append((sources[i], destinations[i])) @@ -108,7 +131,11 @@ def networkx_call(M, benchmark_callable=None): print("Format conversion ... 
") Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() + M, + source=SRC_COL, + target=DST_COL, + edge_attr=EDGE_ATT_COL, + create_using=nx.Graph(), ) # Networkx Jaccard Call @@ -144,118 +171,130 @@ def read_csv(request): @pytest.mark.sg -def test_jaccard(read_csv, gpubenchmark): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_jaccard(read_csv, gpubenchmark, use_weight): M_cu, M, graph_file = read_csv - cu_src, cu_dst, cu_coeff = cugraph_call(gpubenchmark, graph_file, input_df=M_cu) - nx_src, nx_dst, nx_coeff = networkx_call(M) + cu_src, cu_dst, cu_coeff = cugraph_call( + gpubenchmark, graph_file, input_df=M_cu, use_weight=use_weight + ) + if not use_weight: + nx_src, nx_dst, nx_coeff = networkx_call(M) - # Calculating mismatch - err = 0 - tol = 1.0e-06 + # Calculating mismatch + err = 0 + tol = 1.0e-06 - assert len(cu_coeff) == len(nx_coeff) - for i in range(len(cu_coeff)): - if abs(cu_coeff[i] - nx_coeff[i]) > tol * 1.1: - err += 1 + assert len(cu_coeff) == len(nx_coeff) + for i in range(len(cu_coeff)): + if abs(cu_coeff[i] - nx_coeff[i]) > tol * 1.1: + err += 1 - print("Mismatches: %d" % err) - assert err == 0 + print("Mismatches: %d" % err) + assert err == 0 + else: + G = graph_file.get_graph() + res_w_jaccard = cugraph.jaccard_w(G, vertex_pair=M_cu[[SRC_COL, DST_COL]]) + res_w_jaccard = res_w_jaccard.sort_values( + [VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL] + ).reset_index(drop=True) + res_jaccard = cudf.DataFrame() + res_jaccard[VERTEX_PAIR_FIRST_COL] = cu_src + res_jaccard[VERTEX_PAIR_SECOND_COL] = cu_dst + res_jaccard[JACCARD_COEFF_COL] = cu_coeff + assert_frame_equal( + res_w_jaccard, res_jaccard, check_dtype=False, check_like=True + ) + + # FIXME: compare weighted jaccard results against resultset api @pytest.mark.sg -def test_directed_graph_check(read_csv): +@pytest.mark.parametrize("use_weight", [False, True]) +def test_directed_graph_check(read_csv, use_weight): _, M, _ = 
read_csv cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 + cu_M[SRC_COL] = cudf.Series(M[SRC_COL]) + cu_M[DST_COL] = cudf.Series(M[DST_COL]) + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + G1 = cugraph.Graph(directed=True) - G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] - ) + weight = EDGE_ATT_COL if use_weight else None + G1.from_cudf_edgelist(cu_M, source=SRC_COL, destination=DST_COL, weight=weight) + + vertex_pair = cu_M[[SRC_COL, DST_COL]] - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] vertex_pair = vertex_pair[:5] with pytest.raises(ValueError): - cugraph.jaccard(G1, vertex_pair) + cugraph.jaccard(G1, vertex_pair, use_weight) @pytest.mark.sg def test_nx_jaccard_time(read_csv, gpubenchmark): - _, M, _ = read_csv nx_src, nx_dst, nx_coeff = networkx_call(M, gpubenchmark) @pytest.mark.sg @pytest.mark.parametrize("graph_file", [netscience]) -@pytest.mark.skip(reason="Skipping because this datasets is unrenumbered") -def test_jaccard_edgevals(gpubenchmark, graph_file): +@pytest.mark.parametrize("use_weight", [False, True]) +def test_jaccard_edgevals(gpubenchmark, graph_file, use_weight): dataset_path = netscience.get_path() M = utils.read_csv_for_nx(dataset_path) M_cu = utils.read_csv_file(dataset_path) cu_src, cu_dst, cu_coeff = cugraph_call( - gpubenchmark, netscience, edgevals=True, input_df=M_cu + gpubenchmark, netscience, input_df=M_cu, use_weight=use_weight ) - nx_src, nx_dst, nx_coeff = networkx_call(M) + if not use_weight: + nx_src, nx_dst, nx_coeff = networkx_call(M) - # Calculating mismatch - err = 0 - tol = 1.0e-06 - - assert len(cu_coeff) == len(nx_coeff) - for i in range(len(cu_coeff)): - if abs(cu_coeff[i] - nx_coeff[i]) > tol * 1.1: - err += 1 - - print("Mismatches: %d" % err) - assert err == 0 + # Calculating mismatch + err = 0 + tol = 1.0e-06 + 
assert len(cu_coeff) == len(nx_coeff) + for i in range(len(cu_coeff)): + if abs(cu_coeff[i] - nx_coeff[i]) > tol * 1.1: + err += 1 -@pytest.mark.sg -def test_jaccard_two_hop(read_csv): - - _, M, graph_file = read_csv - - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) - G = graph_file.get_graph(ignore_weights=True) - - compare_jaccard_two_hop(G, Gnx) + print("Mismatches: %d" % err) + assert err == 0 + else: + # FIXME: compare results against resultset api + pass @pytest.mark.sg -def test_jaccard_two_hop_edge_vals(read_csv): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_jaccard_two_hop(read_csv, use_weight): _, M, graph_file = read_csv Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() + M, source=SRC_COL, target=DST_COL, create_using=nx.Graph() ) + G = graph_file.get_graph(ignore_weights=not use_weight) - G = graph_file.get_graph() - - compare_jaccard_two_hop(G, Gnx, edgevals=True) + compare_jaccard_two_hop(G, Gnx, use_weight) @pytest.mark.sg def test_jaccard_nx(read_csv): - M_cu, M, _ = read_csv - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) + Gnx = nx.from_pandas_edgelist( + M, source=SRC_COL, target=DST_COL, create_using=nx.Graph() + ) nx_j = nx.jaccard_coefficient(Gnx) nv_js = sorted(nx_j, key=len, reverse=True) - ebunch = M_cu.rename(columns={"0": "first", "1": "second"}) - ebunch = ebunch[["first", "second"]] + ebunch = M_cu.rename( + columns={SRC_COL: VERTEX_PAIR_FIRST_COL, DST_COL: VERTEX_PAIR_SECOND_COL} + ) + ebunch = ebunch[[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]] cg_j = cugraph.jaccard_coefficient(Gnx, ebunch=ebunch) - cg_j_exp = exp_jaccard_coefficient(Gnx, ebunch=ebunch) assert len(nv_js) > len(cg_j) - assert len(nv_js) > len(cg_j_exp) # FIXME: Nx does a full all-pair Jaccard. 
# cuGraph does a limited 1-hop Jaccard @@ -263,68 +302,58 @@ def test_jaccard_nx(read_csv): @pytest.mark.sg -def test_jaccard_multi_column(read_csv): - - _, M, _ = read_csv +@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) +@pytest.mark.parametrize("use_weight", [False, True]) +def test_jaccard_multi_column(graph_file, use_weight): + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 + cu_M[MULTI_COL_SRC_0_COL] = cudf.Series(M[SRC_COL]) + cu_M[MULTI_COL_DST_0_COL] = cudf.Series(M[DST_COL]) + cu_M[MULTI_COL_SRC_1_COL] = cu_M[MULTI_COL_SRC_0_COL] + 1000 + cu_M[MULTI_COL_DST_1_COL] = cu_M[MULTI_COL_DST_0_COL] + 1000 + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + G1 = cugraph.Graph() + weight = EDGE_ATT_COL if use_weight else None G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] + cu_M, + source=[MULTI_COL_SRC_0_COL, MULTI_COL_SRC_1_COL], + destination=[MULTI_COL_DST_0_COL, MULTI_COL_DST_1_COL], + weight=weight, ) - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] + vertex_pair = cu_M[ + [ + MULTI_COL_SRC_0_COL, + MULTI_COL_SRC_1_COL, + MULTI_COL_DST_0_COL, + MULTI_COL_DST_1_COL, + ] + ] vertex_pair = vertex_pair[:5] - df_res = cugraph.jaccard(G1, vertex_pair) - df_plc_exp = exp_jaccard(G1, vertex_pair) - - df_plc_exp = df_plc_exp.rename( - columns={ - "0_src": "0_source", - "0_dst": "0_destination", - "1_src": "1_source", - "1_dst": "1_destination", - } - ) - - jaccard_res = df_res["jaccard_coeff"].sort_values().reset_index(drop=True) - jaccard_plc_exp = df_plc_exp["jaccard_coeff"].sort_values().reset_index(drop=True) - assert_series_equal(jaccard_res, jaccard_plc_exp) + df_multi_col_res = cugraph.jaccard(G1, vertex_pair) G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, 
source="src_0", destination="dst_0") - df_exp = cugraph.jaccard(G2, vertex_pair[["src_0", "dst_0"]]) + G2.from_cudf_edgelist( + cu_M, source=MULTI_COL_SRC_0_COL, destination=MULTI_COL_DST_0_COL, weight=weight + ) + df_single_col_res = cugraph.jaccard( + G2, vertex_pair[[MULTI_COL_SRC_0_COL, MULTI_COL_DST_0_COL]] + ) # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"]) + actual = df_multi_col_res.sort_values("0_src").reset_index() + expected = df_single_col_res.sort_values(VERTEX_PAIR_FIRST_COL).reset_index() + assert_series_equal(actual[JACCARD_COEFF_COL], expected[JACCARD_COEFF_COL]) @pytest.mark.sg -def test_weighted_exp_jaccard(): +def test_weighted_jaccard(): karate = UNDIRECTED_DATASETS[0] - G = karate.get_graph() - with pytest.raises(ValueError): - exp_jaccard(G) - G = karate.get_graph(ignore_weights=True) - use_weight = True - with pytest.raises(ValueError): - exp_jaccard(G, use_weight=use_weight) - - -@pytest.mark.sg -def test_invalid_datasets_jaccard(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") with pytest.raises(ValueError): - cugraph.jaccard(G) + cugraph.jaccard(G, use_weight=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py index 586d534cd42..e24deaa61ac 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py @@ -20,8 +20,19 @@ import cudf import cugraph from cugraph.testing import utils, UNDIRECTED_DATASETS -from cugraph.experimental import overlap as exp_overlap -from cudf.testing import assert_series_equal, assert_frame_equal +from cudf.testing import assert_series_equal +from 
cudf.testing.testing import assert_frame_equal + +SRC_COL = "0" +DST_COL = "1" +VERTEX_PAIR_FIRST_COL = "first" +VERTEX_PAIR_SECOND_COL = "second" +OVERLAP_COEFF_COL = "overlap_coeff" +EDGE_ATT_COL = "weight" +MULTI_COL_SRC_0_COL = "src_0" +MULTI_COL_DST_0_COL = "dst_0" +MULTI_COL_SRC_1_COL = "src_1" +MULTI_COL_DST_1_COL = "dst_1" # ============================================================================= @@ -35,7 +46,6 @@ def setup_function(): # Helper functions # ============================================================================= def compare_overlap(cu_coeff, cpu_coeff): - assert len(cu_coeff) == len(cpu_coeff) for i in range(len(cu_coeff)): if np.isnan(cpu_coeff[i]): @@ -47,21 +57,21 @@ def compare_overlap(cu_coeff, cpu_coeff): assert diff < 1.0e-6 -def cugraph_call(benchmark_callable, graph_file, pairs, edgevals=False): +def cugraph_call(benchmark_callable, graph_file, pairs, use_weight=False): # Device data G = graph_file.get_graph( - create_using=cugraph.Graph(directed=False), ignore_weights=not edgevals + create_using=cugraph.Graph(directed=False), ignore_weights=not use_weight ) # cugraph Overlap Call df = benchmark_callable(cugraph.overlap, G, pairs) - df = df.sort_values(by=["first", "second"]).reset_index(drop=True) - if not edgevals: - # experimental overlap currently only supports unweighted graphs - df_exp = exp_overlap(G, pairs) - df_exp = df_exp.sort_values(by=["first", "second"]).reset_index(drop=True) - assert_frame_equal(df, df_exp, check_dtype=False, check_like=True) + df = df.sort_values(by=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) + if use_weight: + res_w_overlap = cugraph.overlap_w(G, vertex_pair=pairs) + assert_frame_equal(res_w_overlap, df, check_dtype=False, check_like=True) - return df["overlap_coeff"].to_numpy() + return df[OVERLAP_COEFF_COL].to_numpy() def intersection(a, b, M): @@ -120,8 +130,10 @@ def read_csv(request): dataset_path = graph_file.get_path() Mnx = 
utils.read_csv_for_nx(dataset_path) - N = max(max(Mnx["0"]), max(Mnx["1"])) + 1 - M = scipy.sparse.csr_matrix((Mnx.weight, (Mnx["0"], Mnx["1"])), shape=(N, N)) + N = max(max(Mnx[SRC_COL]), max(Mnx[DST_COL])) + 1 + M = scipy.sparse.csr_matrix( + (Mnx.weight, (Mnx[SRC_COL], Mnx[DST_COL])), shape=(N, N) + ) return M, graph_file @@ -135,7 +147,7 @@ def extract_two_hop(read_csv): G = graph_file.get_graph(ignore_weights=True) pairs = ( G.get_two_hop_neighbors() - .sort_values(["first", "second"]) + .sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]) .reset_index(drop=True) ) @@ -144,93 +156,91 @@ def extract_two_hop(read_csv): # Test @pytest.mark.sg -def test_overlap(gpubenchmark, read_csv, extract_two_hop): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_overlap(gpubenchmark, read_csv, extract_two_hop, use_weight): M, graph_file = read_csv pairs = extract_two_hop - cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs) - cpu_coeff = cpu_call(M, pairs["first"], pairs["second"]) + cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs, use_weight=use_weight) + cpu_coeff = cpu_call(M, pairs[VERTEX_PAIR_FIRST_COL], pairs[VERTEX_PAIR_SECOND_COL]) compare_overlap(cu_coeff, cpu_coeff) -# Test @pytest.mark.sg -def test_overlap_edge_vals(gpubenchmark, read_csv, extract_two_hop): +@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) +@pytest.mark.parametrize("use_weight", [False, True]) +def test_directed_graph_check(graph_file, use_weight): + M = utils.read_csv_for_nx(graph_file.get_path()) + cu_M = cudf.DataFrame() + cu_M[SRC_COL] = cudf.Series(M[SRC_COL]) + cu_M[DST_COL] = cudf.Series(M[DST_COL]) + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) - M, graph_file = read_csv - pairs = extract_two_hop + G1 = cugraph.Graph(directed=True) + weight = EDGE_ATT_COL if use_weight else None + G1.from_cudf_edgelist(cu_M, source=SRC_COL, destination=DST_COL, weight=weight) - cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs, 
edgevals=True) - cpu_coeff = cpu_call(M, pairs["first"], pairs["second"]) + vertex_pair = cu_M[[SRC_COL, DST_COL]] - compare_overlap(cu_coeff, cpu_coeff) + vertex_pair = vertex_pair[:5] + with pytest.raises(ValueError): + cugraph.overlap(G1, vertex_pair, use_weight) @pytest.mark.sg @pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) -def test_overlap_multi_column(graph_file): +@pytest.mark.parametrize("use_weight", [False, True]) +def test_overlap_multi_column(graph_file, use_weight): dataset_path = graph_file.get_path() M = utils.read_csv_for_nx(dataset_path) cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 + cu_M[MULTI_COL_SRC_0_COL] = cudf.Series(M[SRC_COL]) + cu_M[MULTI_COL_DST_0_COL] = cudf.Series(M[DST_COL]) + cu_M[MULTI_COL_SRC_1_COL] = cu_M[MULTI_COL_SRC_0_COL] + 1000 + cu_M[MULTI_COL_DST_1_COL] = cu_M[MULTI_COL_DST_0_COL] + 1000 + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + G1 = cugraph.Graph() + weight = EDGE_ATT_COL if use_weight else None G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] + cu_M, + source=[MULTI_COL_SRC_0_COL, MULTI_COL_SRC_1_COL], + destination=[MULTI_COL_DST_0_COL, MULTI_COL_DST_1_COL], + weight=weight, ) - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] + vertex_pair = cu_M[ + [ + MULTI_COL_SRC_0_COL, + MULTI_COL_SRC_1_COL, + MULTI_COL_DST_0_COL, + MULTI_COL_DST_1_COL, + ] + ] vertex_pair = vertex_pair[:5] - df_res = cugraph.overlap(G1, vertex_pair) - df_plc_exp = exp_overlap(G1, vertex_pair) - - df_plc_exp = df_plc_exp.rename( - columns={ - "0_src": "0_source", - "0_dst": "0_destination", - "1_src": "1_source", - "1_dst": "1_destination", - } - ) - overlap_res = df_res["overlap_coeff"].sort_values().reset_index(drop=True) - overlap_plc_exp = df_plc_exp["overlap_coeff"].sort_values().reset_index(drop=True) - 
assert_series_equal(overlap_res, overlap_plc_exp) - + df_multi_col_res = cugraph.overlap(G1, vertex_pair, use_weight=use_weight) G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - df_exp = cugraph.overlap(G2, vertex_pair[["src_0", "dst_0"]]) + G2.from_cudf_edgelist( + cu_M, source=MULTI_COL_SRC_0_COL, destination=MULTI_COL_DST_0_COL, weight=weight + ) + df_single_col_res = cugraph.overlap( + G2, vertex_pair[[MULTI_COL_SRC_0_COL, MULTI_COL_DST_0_COL]] + ) # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["overlap_coeff"], expected["overlap_coeff"]) + actual = df_multi_col_res.sort_values("0_src").reset_index() + expected = df_single_col_res.sort_values(VERTEX_PAIR_FIRST_COL).reset_index() + assert_series_equal(actual[OVERLAP_COEFF_COL], expected[OVERLAP_COEFF_COL]) @pytest.mark.sg -def test_weighted_exp_overlap(): +def test_weighted_overlap(): karate = UNDIRECTED_DATASETS[0] - G = karate.get_graph() - with pytest.raises(ValueError): - exp_overlap(G) - G = karate.get_graph(ignore_weights=True) - use_weight = True - with pytest.raises(ValueError): - exp_overlap(G, use_weight=use_weight) - - -@pytest.mark.sg -def test_invalid_datasets_overlap(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") with pytest.raises(ValueError): - cugraph.overlap(G) + cugraph.overlap(G, use_weight=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py index 3da33a3e853..6b4074fce30 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py @@ -20,11 +20,19 @@ import cugraph from cugraph.testing import utils, UNDIRECTED_DATASETS from 
cugraph.datasets import netscience -from cugraph.experimental import sorensen as exp_sorensen -from cudf.testing import assert_series_equal, assert_frame_equal +from cudf.testing import assert_series_equal +from cudf.testing.testing import assert_frame_equal - -print("Networkx version : {} ".format(nx.__version__)) +SRC_COL = "0" +DST_COL = "1" +VERTEX_PAIR_FIRST_COL = "first" +VERTEX_PAIR_SECOND_COL = "second" +SORENSEN_COEFF_COL = "sorensen_coeff" +EDGE_ATT_COL = "weight" +MULTI_COL_SRC_0_COL = "src_0" +MULTI_COL_DST_0_COL = "dst_0" +MULTI_COL_SRC_1_COL = "src_1" +MULTI_COL_DST_1_COL = "dst_1" # ============================================================================= @@ -37,68 +45,89 @@ def setup_function(): # ============================================================================= # Helper functions # ============================================================================= -def compare_sorensen_two_hop(G, Gnx, edgevals=False): +def compare_sorensen_two_hop(G, Gnx, use_weight=False): """ Compute both cugraph and nx sorensen after extracting the two hop neighbors from G and compare both results """ pairs = ( G.get_two_hop_neighbors() - .sort_values(["first", "second"]) + .sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]) .reset_index(drop=True) ) - nx_pairs = [] - nx_pairs = list(pairs.to_records(index=False)) - preds = nx.jaccard_coefficient(Gnx, nx_pairs) - nx_coeff = [] - for u, v, p in preds: + + # print(f'G = {G.edgelist.edgelist_df}') + + df = cugraph.sorensen(G, pairs) + df = df.sort_values(by=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) + + if not use_weight: + nx_pairs = list(pairs.to_records(index=False)) + + # print(f'nx_pairs = {len(nx_pairs)}') + + preds = nx.jaccard_coefficient(Gnx, nx_pairs) + # FIXME: Use known correct values of Sorensen for few graphs, # hardcode it and compare to Cugraph Sorensen to get a more robust test # Conversion from Networkx Jaccard to Sorensen # No networkX 
equivalent - nx_coeff.append((2 * p) / (1 + p)) - df = cugraph.sorensen(G, pairs) - df = df.sort_values(by=["first", "second"]).reset_index(drop=True) - if not edgevals: - # experimental sorensen currently only supports unweighted graphs - df_exp = exp_sorensen(G, pairs) - df_exp = df_exp.sort_values(by=["first", "second"]).reset_index(drop=True) - assert_frame_equal(df, df_exp, check_dtype=False, check_like=True) - assert len(nx_coeff) == len(df) - for i in range(len(df)): - diff = abs(nx_coeff[i] - df["sorensen_coeff"].iloc[i]) - assert diff < 1.0e-6 - - -def cugraph_call(benchmark_callable, graph_file, edgevals=False, input_df=None): + + nx_coeff = list(map(lambda x: (2 * x[2]) / (1 + x[2]), preds)) + + assert len(nx_coeff) == len(df) + for i in range(len(df)): + diff = abs(nx_coeff[i] - df[SORENSEN_COEFF_COL].iloc[i]) + assert diff < 1.0e-6 + else: + # FIXME: compare results against resultset api + res_w_sorensen = cugraph.sorensen_w(G, vertex_pair=pairs) + res_w_sorensen = res_w_sorensen.sort_values( + [VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL] + ).reset_index(drop=True) + assert_frame_equal(res_w_sorensen, df, check_dtype=False, check_like=True) + + +def cugraph_call(benchmark_callable, graph_file, input_df=None, use_weight=False): G = cugraph.Graph() - G = graph_file.get_graph(ignore_weights=not edgevals) + G = graph_file.get_graph(ignore_weights=not use_weight) # If no vertex_pair is passed as input, 'cugraph.sorensen' will # compute the 'sorensen_similarity' with the two_hop_neighbor of the # entire graph while nx compute with the one_hop_neighbor. 
For better # comparaison, get the one_hop_neighbor of the entire graph for 'cugraph.sorensen' # and pass it as vertex_pair - vertex_pair = input_df.rename(columns={"0": "first", "1": "second"}) - vertex_pair = vertex_pair[["first", "second"]] + if isinstance(input_df, cudf.DataFrame): + vertex_pair = input_df.rename( + columns={SRC_COL: VERTEX_PAIR_FIRST_COL, DST_COL: VERTEX_PAIR_SECOND_COL} + ) + vertex_pair = vertex_pair[[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]] + else: + vertex_pair = cudf.DataFrame( + columns=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL], + dtype=G.edgelist.edgelist_df["src"].dtype, + ) # cugraph Sorensen Call df = benchmark_callable(cugraph.sorensen, G, vertex_pair=vertex_pair) - df = df.sort_values(["first", "second"]).reset_index(drop=True) + df = df.sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) return ( - df["first"].to_numpy(), - df["second"].to_numpy(), - df["sorensen_coeff"].to_numpy(), + df[VERTEX_PAIR_FIRST_COL].to_numpy(), + df[VERTEX_PAIR_SECOND_COL].to_numpy(), + df[SORENSEN_COEFF_COL].to_numpy(), ) def networkx_call(M, benchmark_callable=None): - - sources = M["0"] - destinations = M["1"] + sources = M[SRC_COL] + destinations = M[DST_COL] edges = [] for i in range(len(M)): edges.append((sources[i], destinations[i])) @@ -110,7 +139,11 @@ def networkx_call(M, benchmark_callable=None): print("Format conversion ... 
") Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() + M, + source=SRC_COL, + target=DST_COL, + edge_attr=EDGE_ATT_COL, + create_using=nx.Graph(), ) # Networkx Jaccard Call @@ -149,10 +182,12 @@ def read_csv(request): @pytest.mark.sg -def test_sorensen(gpubenchmark, read_csv): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_sorensen(gpubenchmark, read_csv, use_weight): M_cu, M, graph_file = read_csv - cu_src, cu_dst, cu_coeff = cugraph_call(gpubenchmark, graph_file, input_df=M_cu) + cu_src, cu_dst, cu_coeff = cugraph_call( + gpubenchmark, graph_file, input_df=M_cu, use_weight=use_weight + ) nx_src, nx_dst, nx_coeff = networkx_call(M) # Calculating mismatch @@ -170,20 +205,42 @@ def test_sorensen(gpubenchmark, read_csv): @pytest.mark.sg def test_nx_sorensen_time(gpubenchmark, read_csv): - _, M, _ = read_csv nx_src, nx_dst, nx_coeff = networkx_call(M, gpubenchmark) +@pytest.mark.sg +@pytest.mark.parametrize("use_weight", [False, True]) +def test_directed_graph_check(read_csv, use_weight): + _, M, _ = read_csv + + cu_M = cudf.DataFrame() + cu_M[SRC_COL] = cudf.Series(M[SRC_COL]) + cu_M[DST_COL] = cudf.Series(M[DST_COL]) + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + + G1 = cugraph.Graph(directed=True) + weight = EDGE_ATT_COL if use_weight else None + G1.from_cudf_edgelist(cu_M, source=SRC_COL, destination=DST_COL, weight=weight) + + vertex_pair = cu_M[[SRC_COL, DST_COL]] + + vertex_pair = vertex_pair[:5] + with pytest.raises(ValueError): + cugraph.sorensen(G1, vertex_pair, use_weight) + + @pytest.mark.sg @pytest.mark.parametrize("graph_file", [netscience]) +@pytest.mark.parametrize("use_weight", [False, True]) @pytest.mark.skip(reason="Skipping because this datasets is unrenumbered") -def test_sorensen_edgevals(gpubenchmark, graph_file): +def test_sorensen_edgevals(gpubenchmark, graph_file, use_weight): dataset_path = netscience.get_path() M = 
utils.read_csv_for_nx(dataset_path) M_cu = utils.read_csv_file(dataset_path) cu_src, cu_dst, cu_coeff = cugraph_call( - gpubenchmark, netscience, edgevals=True, input_df=M_cu + gpubenchmark, netscience, input_df=M_cu, use_weight=use_weight ) nx_src, nx_dst, nx_coeff = networkx_call(M) @@ -201,92 +258,89 @@ def test_sorensen_edgevals(gpubenchmark, graph_file): @pytest.mark.sg -def test_sorensen_two_hop(read_csv): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_sorensen_two_hop(read_csv, use_weight): _, M, graph_file = read_csv - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) - G = graph_file.get_graph(ignore_weights=True) + Gnx = nx.from_pandas_edgelist( + M, source=SRC_COL, target=DST_COL, create_using=nx.Graph() + ) + G = graph_file.get_graph(ignore_weights=not use_weight) - compare_sorensen_two_hop(G, Gnx) + compare_sorensen_two_hop(G, Gnx, use_weight=use_weight) @pytest.mark.sg -def test_sorensen_two_hop_edge_vals(read_csv): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_sorensen_two_hop_edge_vals(read_csv, use_weight): _, M, graph_file = read_csv Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() + M, + source=SRC_COL, + target=DST_COL, + edge_attr=EDGE_ATT_COL, + create_using=nx.Graph(), ) - G = graph_file.get_graph() + G = graph_file.get_graph(ignore_weights=not use_weight) - compare_sorensen_two_hop(G, Gnx, edgevals=True) + compare_sorensen_two_hop(G, Gnx, use_weight=use_weight) @pytest.mark.sg -def test_sorensen_multi_column(read_csv): - - _, M, _ = read_csv +@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) +@pytest.mark.parametrize("use_weight", [False, True]) +def test_sorensen_multi_column(graph_file, use_weight): + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = 
cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 + cu_M[MULTI_COL_SRC_0_COL] = cudf.Series(M[SRC_COL]) + cu_M[MULTI_COL_DST_0_COL] = cudf.Series(M[DST_COL]) + cu_M[MULTI_COL_SRC_1_COL] = cu_M[MULTI_COL_SRC_0_COL] + 1000 + cu_M[MULTI_COL_DST_1_COL] = cu_M[MULTI_COL_DST_0_COL] + 1000 + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + G1 = cugraph.Graph() + weight = EDGE_ATT_COL if use_weight else None G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] + cu_M, + source=[MULTI_COL_SRC_0_COL, MULTI_COL_SRC_1_COL], + destination=[MULTI_COL_DST_0_COL, MULTI_COL_DST_1_COL], + weight=weight, ) - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] + vertex_pair = cu_M[ + [ + MULTI_COL_SRC_0_COL, + MULTI_COL_SRC_1_COL, + MULTI_COL_DST_0_COL, + MULTI_COL_DST_1_COL, + ] + ] vertex_pair = vertex_pair[:5] - df_res = cugraph.sorensen(G1, vertex_pair) - df_plc_exp = exp_sorensen(G1, vertex_pair) - - df_plc_exp = df_plc_exp.rename( - columns={ - "0_src": "0_source", - "0_dst": "0_destination", - "1_src": "1_source", - "1_dst": "1_destination", - } - ) - sorensen_res = df_res["sorensen_coeff"].sort_values().reset_index(drop=True) - sorensen_plc_exp = df_plc_exp["sorensen_coeff"].sort_values().reset_index(drop=True) - assert_series_equal(sorensen_res, sorensen_plc_exp) + df_multi_col_res = cugraph.sorensen(G1, vertex_pair) G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - df_exp = cugraph.sorensen(G2, vertex_pair[["src_0", "dst_0"]]) + G2.from_cudf_edgelist( + cu_M, source=MULTI_COL_SRC_0_COL, destination=MULTI_COL_DST_0_COL, weight=weight + ) + df_single_col_res = cugraph.sorensen( + G2, vertex_pair[[MULTI_COL_SRC_0_COL, MULTI_COL_DST_0_COL]] + ) # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["sorensen_coeff"], expected["sorensen_coeff"]) + actual = 
df_multi_col_res.sort_values("0_src").reset_index() + expected = df_single_col_res.sort_values(VERTEX_PAIR_FIRST_COL).reset_index() + assert_series_equal(actual[SORENSEN_COEFF_COL], expected[SORENSEN_COEFF_COL]) @pytest.mark.sg -def test_weighted_exp_sorensen(): +def test_weighted_sorensen(): karate = UNDIRECTED_DATASETS[0] - G = karate.get_graph() - with pytest.raises(ValueError): - exp_sorensen(G) - G = karate.get_graph(ignore_weights=True) - use_weight = True - with pytest.raises(ValueError): - exp_sorensen(G, use_weight=use_weight) - - -@pytest.mark.sg -def test_invalid_datasets_sorensen(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") with pytest.raises(ValueError): - cugraph.sorensen(G) + cugraph.sorensen(G, use_weight=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py deleted file mode 100644 index 36a21df46b8..00000000000 --- a/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc - -import pytest -import numpy as np -import networkx as nx - -import cudf -import cugraph -from cugraph.testing import utils, UNDIRECTED_DATASETS -from cudf.testing import assert_series_equal - - -print("Networkx version : {} ".format(nx.__version__)) - - -# ============================================================================= -# Pytest Setup / Teardown - called for each test function -# ============================================================================= -def setup_function(): - gc.collect() - - -def cugraph_call(benchmark_callable, graph_file): - # Device data - cu_M = graph_file.get_edgelist() - weight_arr = cudf.Series( - np.ones(max(cu_M["src"].max(), cu_M["dst"].max()) + 1, dtype=np.float32) - ) - weights = cudf.DataFrame() - weights["vertex"] = np.arange(len(weight_arr), dtype=np.int32) - weights["weight"] = weight_arr - - G = graph_file.get_graph(ignore_weights=True) - - # cugraph Jaccard Call - df = benchmark_callable(cugraph.jaccard_w, G, weights) - - df = df.sort_values(["first", "second"]).reset_index(drop=True) - - return df["jaccard_coeff"] - - -def networkx_call(M, benchmark_callable=None): - - sources = M["0"] - destinations = M["1"] - edges = [] - for i in range(len(sources)): - edges.append((sources[i], destinations[i])) - edges.append((destinations[i], sources[i])) - edges = list(dict.fromkeys(edges)) - edges = sorted(edges) - # in NVGRAPH tests we read as CSR and feed as CSC, so here we doing this - # explicitly - print("Format conversion ... ") - - # NetworkX graph - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) - # Networkx Jaccard Call - print("Solving... 
") - if benchmark_callable is not None: - preds = benchmark_callable(nx.jaccard_coefficient, Gnx, edges) - else: - preds = nx.jaccard_coefficient(Gnx, edges) - - coeff = [] - for u, v, p in preds: - coeff.append(p) - return coeff - - -# ============================================================================= -# Pytest Fixtures -# ============================================================================= -@pytest.fixture(scope="module", params=UNDIRECTED_DATASETS) -def read_csv(request): - """ - Read csv file for both networkx and cugraph - """ - graph_file = request.param - dataset_path = graph_file.get_path() - M = utils.read_csv_for_nx(dataset_path) - - return M, graph_file - - -@pytest.mark.sg -def test_wjaccard(gpubenchmark, read_csv): - - M, graph_file = read_csv - - cu_coeff = cugraph_call(gpubenchmark, graph_file) - nx_coeff = networkx_call(M) - for i in range(len(cu_coeff)): - diff = abs(nx_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -def test_nx_wjaccard_time(gpubenchmark, read_csv): - - M, _ = read_csv - networkx_call(M, gpubenchmark) - - -@pytest.mark.sg -def test_wjaccard_multi_column_weights(gpubenchmark, read_csv): - - M, graph_file = read_csv - - cu_coeff = cugraph_call(gpubenchmark, graph_file) - nx_coeff = networkx_call(M) - for i in range(len(cu_coeff)): - diff = abs(nx_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -def test_wjaccard_multi_column(read_csv): - - M, _ = read_csv - - cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 - G1 = cugraph.Graph() - G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] - ) - - G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] - vertex_pair = vertex_pair[:5] - - weight_arr = 
cudf.Series(np.ones(G2.number_of_vertices(), dtype=np.float32)) - weights = cudf.DataFrame() - weights["vertex"] = G2.nodes() - weights["vertex_"] = weights["vertex"] + 1000 - weights["weight"] = weight_arr - - df_res = cugraph.jaccard_w(G1, weights, vertex_pair) - - weights = weights[["vertex", "weight"]] - df_exp = cugraph.jaccard_w(G2, weights, vertex_pair[["src_0", "dst_0"]]) - - # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"]) - - -@pytest.mark.sg -def test_invalid_datasets_jaccard_w(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") - with pytest.raises(ValueError): - cugraph.jaccard_w(G, None) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py b/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py deleted file mode 100644 index 1dffb9fca41..00000000000 --- a/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc - -import pytest -import scipy -import numpy as np - -import cudf -import cugraph -from cudf.testing import assert_series_equal -from cugraph.testing import utils, UNDIRECTED_DATASETS - - -# ============================================================================= -# Pytest Setup / Teardown - called for each test function -# ============================================================================= -def setup_function(): - gc.collect() - - -def cugraph_call(benchmark_callable, graph_file, pairs): - # Device data - cu_M = graph_file.get_edgelist() - weights_arr = cudf.Series( - np.ones(max(cu_M["src"].max(), cu_M["dst"].max()) + 1, dtype=np.float32) - ) - weights = cudf.DataFrame() - weights["vertex"] = np.arange(len(weights_arr), dtype=np.int32) - weights["weight"] = weights_arr - - G = graph_file.get_graph(create_using=cugraph.Graph(directed=True)) - - # cugraph Overlap Call - df = benchmark_callable(cugraph.overlap_w, G, weights, pairs) - - df = df.sort_values(by=["first", "second"]) - return df["overlap_coeff"].to_numpy() - - -def intersection(a, b, M): - count = 0 - a_idx = M.indptr[a] - b_idx = M.indptr[b] - - while (a_idx < M.indptr[a + 1]) and (b_idx < M.indptr[b + 1]): - a_vertex = M.indices[a_idx] - b_vertex = M.indices[b_idx] - - if a_vertex == b_vertex: - count += 1 - a_idx += 1 - b_idx += 1 - elif a_vertex < b_vertex: - a_idx += 1 - else: - b_idx += 1 - - return count - - -def degree(a, M): - return M.indptr[a + 1] - M.indptr[a] - - -def overlap(a, b, M): - b_sum = degree(b, M) - if b_sum == 0: - return float("NaN") - - i = intersection(a, b, M) - a_sum = degree(a, M) - total = min(a_sum, b_sum) - return i / total - - -def cpu_call(M, first, second): - result = [] - for i in range(len(first)): - result.append(overlap(first[i], second[i], M)) - return result - - -@pytest.mark.sg -@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) -def test_woverlap(gpubenchmark, graph_file): - dataset_path = graph_file.get_path() - Mnx = 
utils.read_csv_for_nx(dataset_path) - N = max(max(Mnx["0"]), max(Mnx["1"])) + 1 - M = scipy.sparse.csr_matrix((Mnx.weight, (Mnx["0"], Mnx["1"])), shape=(N, N)) - - G = graph_file.get_graph(ignore_weights=True) - pairs = ( - G.get_two_hop_neighbors() - .sort_values(["first", "second"]) - .reset_index(drop=True) - ) - - cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs) - cpu_coeff = cpu_call(M, pairs["first"], pairs["second"]) - assert len(cu_coeff) == len(cpu_coeff) - for i in range(len(cu_coeff)): - if np.isnan(cpu_coeff[i]): - assert np.isnan(cu_coeff[i]) - elif np.isnan(cu_coeff[i]): - assert cpu_coeff[i] == cu_coeff[i] - else: - diff = abs(cpu_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) -def test_woverlap_multi_column(graph_file): - dataset_path = graph_file.get_path() - M = utils.read_csv_for_nx(dataset_path) - - cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 - G1 = cugraph.Graph() - G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] - ) - - G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] - vertex_pair = vertex_pair[:5] - - weight_arr = cudf.Series(np.ones(G2.number_of_vertices(), dtype=np.float32)) - - weights = cudf.DataFrame() - weights["vertex"] = G2.nodes() - weights["vertex_"] = weights["vertex"] + 1000 - weights["weight"] = weight_arr - - df_res = cugraph.overlap_w(G1, weights, vertex_pair) - - weights = weights[["vertex", "weight"]] - df_exp = cugraph.overlap_w(G2, weights, vertex_pair[["src_0", "dst_0"]]) - - # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["overlap_coeff"], 
expected["overlap_coeff"]) - - -@pytest.mark.sg -def test_invalid_datasets_overlap_w(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") - with pytest.raises(ValueError): - cugraph.overlap_w(G, None) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py deleted file mode 100644 index 8d09b3e25b3..00000000000 --- a/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc - -import pytest -import numpy as np -import networkx as nx - -import cudf -import cugraph -from cudf.testing import assert_series_equal -from cugraph.testing import utils, UNDIRECTED_DATASETS - - -print("Networkx version : {} ".format(nx.__version__)) - - -# ============================================================================= -# Pytest Setup / Teardown - called for each test function -# ============================================================================= -def setup_function(): - gc.collect() - - -def cugraph_call(benchmark_callable, graph_file): - # Device data - cu_M = graph_file.get_edgelist() - weight_arr = cudf.Series( - np.ones(max(cu_M["src"].max(), cu_M["dst"].max()) + 1, dtype=np.float32) - ) - weights = cudf.DataFrame() - weights["vertex"] = np.arange(len(weight_arr), dtype=np.int32) - weights["weight"] = weight_arr - - G = graph_file.get_graph(ignore_weights=True) - - # cugraph Sorensen Call - df = benchmark_callable(cugraph.sorensen_w, G, weights) - - df = df.sort_values(["first", "second"]).reset_index(drop=True) - - return df["sorensen_coeff"] - - -def networkx_call(M, benchmark_callable=None): - - sources = M["0"] - destinations = M["1"] - edges = [] - for i in range(len(sources)): - edges.append((sources[i], destinations[i])) - edges.append((destinations[i], sources[i])) - edges = list(dict.fromkeys(edges)) - edges = sorted(edges) - # in NVGRAPH tests we read as CSR and feed as CSC, so here we doing this - # explicitly - print("Format conversion ... ") - - # NetworkX graph - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) - # Networkx Jaccard Call - print("Solving... 
") - if benchmark_callable is not None: - preds = benchmark_callable(nx.jaccard_coefficient, Gnx, edges) - else: - preds = nx.jaccard_coefficient(Gnx, edges) - coeff = [] - for u, v, p in preds: - # FIXME: Use known correct values of WSorensen for few graphs, - # hardcode it and compare to Cugraph WSorensen - # to get a more robust test - - # Conversion from Networkx Jaccard to Sorensen - coeff.append((2 * p) / (1 + p)) - return coeff - - -# ============================================================================= -# Pytest Fixtures -# ============================================================================= -@pytest.fixture(scope="module", params=UNDIRECTED_DATASETS) -def read_csv(request): - """ - Read csv file for both networkx and cugraph - """ - graph_file = request.param - dataset_path = graph_file.get_path() - M = utils.read_csv_for_nx(dataset_path) - - return M, graph_file - - -@pytest.mark.sg -def test_wsorensen(gpubenchmark, read_csv): - - M, graph_file = read_csv - - cu_coeff = cugraph_call(gpubenchmark, graph_file) - nx_coeff = networkx_call(M) - for i in range(len(cu_coeff)): - diff = abs(nx_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -def test_nx_wsorensen_time(gpubenchmark, read_csv): - - M, _ = read_csv - networkx_call(M, gpubenchmark) - - -@pytest.mark.sg -def test_wsorensen_multi_column_weights(gpubenchmark, read_csv): - - M, cu_M = read_csv - - cu_coeff = cugraph_call(gpubenchmark, cu_M) - nx_coeff = networkx_call(M) - for i in range(len(cu_coeff)): - diff = abs(nx_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -def test_wsorensen_multi_column(read_csv): - - M, _ = read_csv - - cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 - G1 = cugraph.Graph() - G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] - ) - - G2 = cugraph.Graph() - 
G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] - vertex_pair = vertex_pair[:5] - - weight_arr = cudf.Series(np.ones(G2.number_of_vertices(), dtype=np.float32)) - weights = cudf.DataFrame() - weights["vertex"] = G2.nodes() - weights["vertex_"] = weights["vertex"] + 1000 - weights["weight"] = weight_arr - - df_res = cugraph.sorensen_w(G1, weights, vertex_pair) - - weights = weights[["vertex", "weight"]] - df_exp = cugraph.sorensen_w(G2, weights, vertex_pair[["src_0", "dst_0"]]) - - # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["sorensen_coeff"], expected["sorensen_coeff"]) - - -@pytest.mark.sg -def test_invalid_datasets_sorensen_w(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") - with pytest.raises(ValueError): - cugraph.sorensen_w(G, None) diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index 711652bbae6..45f6de2f663 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -87,6 +87,13 @@ from pylibcugraph.generate_rmat_edgelists import generate_rmat_edgelists +from pylibcugraph.jaccard_coefficients import jaccard_coefficients + +from pylibcugraph.overlap_coefficients import overlap_coefficients + +from pylibcugraph.sorensen_coefficients import sorensen_coefficients + + from pylibcugraph import exceptions __version__ = "23.10.00" diff --git a/python/pylibcugraph/pylibcugraph/experimental/__init__.py b/python/pylibcugraph/pylibcugraph/experimental/__init__.py index 1b93f9322af..6194ace5956 100644 --- a/python/pylibcugraph/pylibcugraph/experimental/__init__.py +++ b/python/pylibcugraph/pylibcugraph/experimental/__init__.py @@ -1,4 +1,4 @@ 
-# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -74,18 +74,17 @@ from pylibcugraph.node2vec import node2vec -node2vec = promoted_experimental_warning_wrapper(node2vec) -from pylibcugraph.jaccard_coefficients import EXPERIMENTAL__jaccard_coefficients +# from pylibcugraph.jaccard_coefficients import EXPERIMENTAL__jaccard_coefficients -jaccard_coefficients = experimental_warning_wrapper(EXPERIMENTAL__jaccard_coefficients) +# jaccard_coefficients = experimental_warning_wrapper(EXPERIMENTAL__jaccard_coefficients) -from pylibcugraph.overlap_coefficients import EXPERIMENTAL__overlap_coefficients +# from pylibcugraph.overlap_coefficients import EXPERIMENTAL__overlap_coefficients -overlap_coefficients = experimental_warning_wrapper(EXPERIMENTAL__overlap_coefficients) +# overlap_coefficients = experimental_warning_wrapper(EXPERIMENTAL__overlap_coefficients) -from pylibcugraph.sorensen_coefficients import EXPERIMENTAL__sorensen_coefficients +# from pylibcugraph.sorensen_coefficients import EXPERIMENTAL__sorensen_coefficients -sorensen_coefficients = experimental_warning_wrapper( - EXPERIMENTAL__sorensen_coefficients -) +# sorensen_coefficients = experimental_warning_wrapper( +# EXPERIMENTAL__sorensen_coefficients +# ) diff --git a/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx b/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx index 805ee821eab..59e94aeb615 100644 --- a/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -15,6 +15,8 @@ # cython: language_level = 3 from libc.stdint cimport uintptr_t +from libc.stdio cimport printf +from cython.operator cimport dereference from pylibcugraph._cugraph_c.resource_handle cimport ( bool_t, @@ -57,7 +59,7 @@ from pylibcugraph.utils cimport ( ) -def EXPERIMENTAL__jaccard_coefficients(ResourceHandle resource_handle, +def jaccard_coefficients(ResourceHandle resource_handle, _GPUGraph graph, first, second, @@ -83,8 +85,10 @@ def EXPERIMENTAL__jaccard_coefficients(ResourceHandle resource_handle, second : Destination of the vertex pair. - use_weight : bool, optional (default=False) - Currently not supported + use_weight : bool, optional + If set to True, then compute weighted jaccard_coefficients ( + the input graph must be weighted in that case). + Otherwise, compute un-weighted jaccard_coefficients do_expensive_check : bool If True, performs more extensive tests on the inputs to ensure diff --git a/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx b/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx index 6af71116469..28360121c64 100644 --- a/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -57,7 +57,7 @@ from pylibcugraph.utils cimport ( ) -def EXPERIMENTAL__overlap_coefficients(ResourceHandle resource_handle, +def overlap_coefficients(ResourceHandle resource_handle, _GPUGraph graph, first, second, @@ -84,8 +84,10 @@ def EXPERIMENTAL__overlap_coefficients(ResourceHandle resource_handle, second : Destination of the vertex pair.
- use_weight : bool, optional (default=False) - Currently not supported + use_weight : bool, optional + If set to True, then compute weighted overlap_coefficients ( + the input graph must be weighted in that case). + Otherwise, compute un-weighted overlap_coefficients do_expensive_check : bool If True, performs more extensive tests on the inputs to ensure diff --git a/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx b/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx index 12647baccb2..983a635012f 100644 --- a/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -57,7 +57,7 @@ from pylibcugraph.utils cimport ( ) -def EXPERIMENTAL__sorensen_coefficients(ResourceHandle resource_handle, +def sorensen_coefficients(ResourceHandle resource_handle, _GPUGraph graph, first, second, @@ -83,8 +83,10 @@ def EXPERIMENTAL__sorensen_coefficients(ResourceHandle resource_handle, second : Destination of the vertex pair. - use_weight : bool, optional (default=False) - Currently not supported + use_weight : bool, optional + If set to True, then compute weighted sorensen_coefficients ( + the input graph must be weighted in that case). + Otherwise, compute un-weighted sorensen_coefficients do_expensive_check : bool If True, performs more extensive tests on the inputs to ensure