From 8f7fec98ba90a765593734d2307b559988f60f8d Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Wed, 31 Jul 2024 06:25:03 +0100 Subject: [PATCH] Expose new all-pairs Similarity algorithms (#4502) A variation of the Similarity algorithms leveraging all pairs of vertices is available in the C and C++ API. It also enables the retrieval of the top k vertices. This PR: - Exposes the new all-pairs Similarity algorithms to the PLC and Python API - Adds SG and MG Python tests - Adds docstrings with examples closes #4470 Authors: - Joseph Nke (https://github.com/jnke2016) - Ralph Liu (https://github.com/nv-rliu) - Rick Ratzel (https://github.com/rlratzel) Approvers: - Seunghwa Kang (https://github.com/seunghwak) - Chuck Hastings (https://github.com/ChuckHastings) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/4502 --- cpp/include/cugraph_c/similarity_algorithms.h | 66 ++ cpp/src/c_api/similarity.cpp | 115 +++ cpp/tests/c_api/mg_similarity_test.c | 769 ++++++++++++++++-- cpp/tests/c_api/similarity_test.c | 213 ++++- python/cugraph/cugraph/__init__.py | 6 + python/cugraph/cugraph/dask/__init__.py | 5 + .../cugraph/dask/link_prediction/cosine.py | 323 ++++++++ .../cugraph/dask/link_prediction/jaccard.py | 182 ++++- .../cugraph/dask/link_prediction/overlap.py | 183 ++++- .../cugraph/dask/link_prediction/sorensen.py | 183 ++++- .../cugraph/link_prediction/__init__.py | 6 + .../cugraph/cugraph/link_prediction/cosine.py | 359 ++++++++ .../cugraph/link_prediction/jaccard.py | 119 ++- .../cugraph/link_prediction/overlap.py | 121 ++- .../cugraph/link_prediction/sorensen.py | 123 ++- .../tests/link_prediction/test_cosine_mg.py | 292 +++++++ .../tests/link_prediction/test_jaccard.py | 89 +- .../tests/link_prediction/test_jaccard_mg.py | 135 ++- .../tests/link_prediction/test_overlap.py | 150 +++- .../tests/link_prediction/test_overlap_mg.py | 135 ++- .../tests/link_prediction/test_sorensen.py | 149 +++- .../tests/link_prediction/test_sorensen_mg.py | 135 ++- .../pylibcugraph/pylibcugraph/CMakeLists.txt | 5 + python/pylibcugraph/pylibcugraph/__init__.py | 10 + .../_cugraph_c/similarity_algorithms.pxd | 76 +- .../all_pairs_cosine_coefficients.pyx | 164 ++++ .../all_pairs_jaccard_coefficients.pyx | 164 ++++ .../all_pairs_overlap_coefficients.pyx | 164 ++++ .../all_pairs_sorensen_coefficients.pyx | 164 ++++ .../pylibcugraph/cosine_coefficients.pyx | 171 ++++ python/pylibcugraph/pylibcugraph/utils.pxd | 5 +- 31 files changed, 4631 insertions(+), 150 deletions(-) create mode 100644 python/cugraph/cugraph/dask/link_prediction/cosine.py create mode 100644 python/cugraph/cugraph/link_prediction/cosine.py create mode 100644 python/cugraph/cugraph/tests/link_prediction/test_cosine_mg.py create mode 100644 python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx create mode 100644 python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx create mode 100644 python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx create mode 100644 python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx create mode 100644 python/pylibcugraph/pylibcugraph/cosine_coefficients.pyx diff --git a/cpp/include/cugraph_c/similarity_algorithms.h b/cpp/include/cugraph_c/similarity_algorithms.h index 5b8462a1666..12f55132fc7 100644 --- a/cpp/include/cugraph_c/similarity_algorithms.h +++ b/cpp/include/cugraph_c/similarity_algorithms.h @@ -145,6 +145,34 @@ cugraph_error_code_t cugraph_overlap_coefficients(const
cugraph_resource_handle_ cugraph_similarity_result_t** result, cugraph_error_t** error); +/** + * @brief Perform cosine similarity computation + * + * Compute the similarity for the specified vertex_pairs + * + * Note that cosine similarity must run on a symmetric graph. + * + * @param [in] handle Handle for accessing resources + * @param [in] graph Pointer to graph + * @param [in] vertex_pairs Vertex pair for input + * @param [in] use_weight If true consider the edge weight in the graph, if false use an + * edge weight of 1 + * @param [in] do_expensive_check A flag to run expensive checks for input arguments (if set to + * `true`). + * @param [out] result Opaque pointer to similarity results + * @param [out] error Pointer to an error object storing details of any error. Will + * be populated if error code is not CUGRAPH_SUCCESS + * @return error code + */ +cugraph_error_code_t cugraph_cosine_similarity_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_vertex_pairs_t* vertex_pairs, + bool_t use_weight, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error); + /** * @brief Perform All-Pairs Jaccard similarity computation * @@ -259,6 +287,44 @@ cugraph_error_code_t cugraph_all_pairs_overlap_coefficients( cugraph_similarity_result_t** result, cugraph_error_t** error); +/** + * @brief Perform All Pairs cosine similarity computation + * + * Compute the similarity for all vertex pairs derived from the two-hop neighbors + * of an optional specified vertex list. This function will identify the two-hop + * neighbors of the specified vertices (all vertices in the graph if not specified) + * and compute similarity for those vertices. + * + * If the topk parameter is specified then the result will only contain the top k + * highest scoring results. + * + * Note that cosine similarity must run on a symmetric graph. + * + * @param [in] handle Handle for accessing resources + * @param [in] graph Pointer to graph + * @param [in] vertices Vertex list for input. If null then compute based on + * all vertices in the graph. + * @param [in] use_weight If true consider the edge weight in the graph, if false use an + * edge weight of 1 + * @param [in] topk Specify how many answers to return. Specifying SIZE_MAX + * will return all values. + * @param [in] do_expensive_check A flag to run expensive checks for input arguments (if set to + * `true`). + * @param [out] result Opaque pointer to similarity results + * @param [out] error Pointer to an error object storing details of any error. Will + * be populated if error code is not CUGRAPH_SUCCESS + * @return error code + */ +cugraph_error_code_t cugraph_all_pairs_cosine_similarity_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error); + #ifdef __cplusplus } #endif diff --git a/cpp/src/c_api/similarity.cpp b/cpp/src/c_api/similarity.cpp index aa54fc6dee7..36f1a74f3e0 100644 --- a/cpp/src/c_api/similarity.cpp +++ b/cpp/src/c_api/similarity.cpp @@ -212,6 +212,22 @@ struct all_pairs_similarity_functor : public cugraph::c_api::abstract_functor { : std::nullopt, topk_ != SIZE_MAX ? 
std::make_optional(topk_) : std::nullopt); + cugraph::unrenumber_int_vertices( + handle_, + v1.data(), + v1.size(), + number_map->data(), + graph_view.vertex_partition_range_lasts(), + false); + + cugraph::unrenumber_int_vertices( + handle_, + v2.data(), + v2.size(), + number_map->data(), + graph_view.vertex_partition_range_lasts(), + false); + result_ = new cugraph::c_api::cugraph_similarity_result_t{ new cugraph::c_api::cugraph_type_erased_device_array_t(similarity_coefficients, graph_->weight_type_), @@ -274,6 +290,33 @@ struct sorensen_functor { } }; +struct cosine_functor { + template + rmm::device_uvector operator()( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs) + { + return cugraph::cosine_similarity_coefficients( + handle, graph_view, edge_weight_view, vertex_pairs); + } + + template + std::tuple, + rmm::device_uvector, + rmm::device_uvector> + operator()(raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk) + { + return cugraph::cosine_similarity_all_pairs_coefficients( + handle, graph_view, edge_weight_view, vertices, topk); + } +}; + struct overlap_functor { template rmm::device_uvector operator()( @@ -300,6 +343,33 @@ struct overlap_functor { } }; +struct cosine_similarity_functor { + template + rmm::device_uvector operator()( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs) + { + return cugraph::cosine_similarity_coefficients( + handle, graph_view, edge_weight_view, vertex_pairs); + } + + template + std::tuple, + rmm::device_uvector, + rmm::device_uvector> + operator()(raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk) + { + return cugraph::cosine_similarity_all_pairs_coefficients( + handle, graph_view, edge_weight_view, vertices, topk); + } +}; + } // namespace extern "C" cugraph_type_erased_device_array_view_t* cugraph_similarity_result_get_similarity( @@ -391,6 +461,28 @@ extern "C" cugraph_error_code_t cugraph_overlap_coefficients( return cugraph::c_api::run_algorithm(graph, functor, result, error); } +extern "C" cugraph_error_code_t cugraph_cosine_similarity_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_vertex_pairs_t* vertex_pairs, + bool_t use_weight, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error) +{ + if (use_weight) { + CAPI_EXPECTS( + reinterpret_cast(graph)->edge_weights_ != nullptr, + CUGRAPH_INVALID_INPUT, + "use_weight is true but edge weights are not provided.", + *error); + } + similarity_functor functor( + handle, graph, vertex_pairs, cosine_similarity_functor{}, use_weight, do_expensive_check); + + return cugraph::c_api::run_algorithm(graph, functor, result, error); +} + extern "C" cugraph_error_code_t cugraph_all_pairs_jaccard_coefficients( const cugraph_resource_handle_t* handle, cugraph_graph_t* graph, @@ -459,3 +551,26 @@ extern "C" cugraph_error_code_t cugraph_all_pairs_overlap_coefficients( return cugraph::c_api::run_algorithm(graph, functor, result, error); } + +extern "C" cugraph_error_code_t cugraph_all_pairs_cosine_similarity_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const 
cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error) +{ + if (use_weight) { + CAPI_EXPECTS( + reinterpret_cast(graph)->edge_weights_ != nullptr, + CUGRAPH_INVALID_INPUT, + "use_weight is true but edge weights are not provided.", + *error); + } + all_pairs_similarity_functor functor( + handle, graph, vertices, overlap_functor{}, use_weight, topk, do_expensive_check); + + return cugraph::c_api::run_algorithm(graph, functor, result, error); +} diff --git a/cpp/tests/c_api/mg_similarity_test.c b/cpp/tests/c_api/mg_similarity_test.c index 587acb4d295..486ca34aaca 100644 --- a/cpp/tests/c_api/mg_similarity_test.c +++ b/cpp/tests/c_api/mg_similarity_test.c @@ -26,7 +26,16 @@ typedef int32_t vertex_t; typedef int32_t edge_t; typedef float weight_t; -typedef enum { JACCARD, SORENSEN, OVERLAP } similarity_t; +typedef enum { + JACCARD, + SORENSEN, + OVERLAP, + COSINE, + ALL_PAIRS_JACCARD, + ALL_PAIRS_SORENSEN, + ALL_PAIRS_OVERLAP, + ALL_PAIRS_COSINE +} similarity_t; int generic_similarity_test(const cugraph_resource_handle_t* handle, vertex_t* h_src, @@ -34,10 +43,13 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, weight_t* h_wgt, vertex_t* h_first, vertex_t* h_second, + vertex_t* h_start_vertices, weight_t* h_result, size_t num_vertices, size_t num_edges, size_t num_pairs, + size_t num_start_vertices, + size_t topk, bool_t store_transposed, bool_t use_weight, similarity_t test_type) @@ -48,13 +60,15 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; cugraph_error_t* ret_error; - cugraph_graph_t* graph = NULL; - cugraph_similarity_result_t* result = NULL; - cugraph_vertex_pairs_t* vertex_pairs = NULL; - cugraph_type_erased_device_array_t* v1 = NULL; - cugraph_type_erased_device_array_t* v2 = NULL; - cugraph_type_erased_device_array_view_t* v1_view = NULL; - cugraph_type_erased_device_array_view_t* v2_view = NULL; + cugraph_graph_t* graph = NULL; + cugraph_similarity_result_t* result = NULL; + cugraph_vertex_pairs_t* vertex_pairs = NULL; + cugraph_type_erased_device_array_t* v1 = NULL; + cugraph_type_erased_device_array_t* v2 = NULL; + cugraph_type_erased_device_array_t* start_v = NULL; + cugraph_type_erased_device_array_view_t* v1_view = NULL; + cugraph_type_erased_device_array_view_t* v2_view = NULL; + cugraph_type_erased_device_array_view_t* start_v_view = NULL; ret_code = create_test_graph( handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, TRUE, &graph, &ret_error); @@ -62,44 +76,81 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed."); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + if (topk == 0) { topk = SIZE_MAX; } + if (cugraph_resource_handle_get_rank(handle) != 0) { num_pairs = 0; } - ret_code = - cugraph_type_erased_device_array_create(handle, num_pairs, vertex_tid, &v1, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "v1 create failed."); + if (h_first != NULL && h_second != NULL) { + ret_code = + cugraph_type_erased_device_array_create(handle, num_pairs, vertex_tid, &v1, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "v1 create failed."); + + ret_code = + cugraph_type_erased_device_array_create(handle, num_pairs, vertex_tid, &v2, &ret_error); + 
TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "v2 create failed."); + + v1_view = cugraph_type_erased_device_array_view(v1); + v2_view = cugraph_type_erased_device_array_view(v2); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, v1_view, (byte_t*)h_first, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "h_first copy_from_host failed."); - ret_code = - cugraph_type_erased_device_array_create(handle, num_pairs, vertex_tid, &v2, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "v2 create failed."); + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, v2_view, (byte_t*)h_second, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "h_second copy_from_host failed."); - v1_view = cugraph_type_erased_device_array_view(v1); - v2_view = cugraph_type_erased_device_array_view(v2); + ret_code = + cugraph_create_vertex_pairs(handle, graph, v1_view, v2_view, &vertex_pairs, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create vertex pairs failed."); + } - ret_code = cugraph_type_erased_device_array_view_copy_from_host( - handle, v1_view, (byte_t*)h_first, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "h_first copy_from_host failed."); + if (h_start_vertices != NULL) { + ret_code = cugraph_type_erased_device_array_create( + handle, num_start_vertices, vertex_tid, &start_v, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "v1 create failed."); + start_v_view = cugraph_type_erased_device_array_view(start_v); - ret_code = cugraph_type_erased_device_array_view_copy_from_host( - handle, v2_view, (byte_t*)h_second, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "h_second copy_from_host failed."); + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, start_v_view, (byte_t*)h_start_vertices, &ret_error); - ret_code = - cugraph_create_vertex_pairs(handle, graph, v1_view, v2_view, &vertex_pairs, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create vertex pairs failed."); + TEST_ASSERT( + test_ret_value, ret_code == CUGRAPH_SUCCESS, "h_start_vertices copy_from_host failed."); + } switch (test_type) { case JACCARD: ret_code = cugraph_jaccard_coefficients( handle, graph, vertex_pairs, use_weight, FALSE, &result, &ret_error); break; + case ALL_PAIRS_JACCARD: + ret_code = cugraph_all_pairs_jaccard_coefficients( + handle, graph, start_v_view, use_weight, topk, FALSE, &result, &ret_error); + break; case SORENSEN: ret_code = cugraph_sorensen_coefficients( handle, graph, vertex_pairs, use_weight, FALSE, &result, &ret_error); break; + case ALL_PAIRS_SORENSEN: + ret_code = cugraph_all_pairs_sorensen_coefficients( + handle, graph, start_v_view, use_weight, topk, FALSE, &result, &ret_error); + break; case OVERLAP: ret_code = cugraph_overlap_coefficients( handle, graph, vertex_pairs, use_weight, FALSE, &result, &ret_error); break; + case ALL_PAIRS_OVERLAP: + ret_code = cugraph_all_pairs_overlap_coefficients( + handle, graph, start_v_view, use_weight, topk, FALSE, &result, &ret_error); + break; + case COSINE: + ret_code = cugraph_cosine_similarity_coefficients( + handle, graph, vertex_pairs, use_weight, FALSE, &result, &ret_error); + break; + case ALL_PAIRS_COSINE: + ret_code = cugraph_all_pairs_cosine_similarity_coefficients( + handle, graph, start_v_view, use_weight, topk, FALSE, &result, &ret_error); + break; } TEST_ASSERT(test_ret_value, ret_code == 
CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); @@ -109,6 +160,21 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, similarity_coefficient = cugraph_similarity_result_get_similarity(result); + switch (test_type) { + case ALL_PAIRS_JACCARD: + num_pairs = cugraph_type_erased_device_array_view_size(similarity_coefficient); + break; + case ALL_PAIRS_SORENSEN: + num_pairs = cugraph_type_erased_device_array_view_size(similarity_coefficient); + break; + case ALL_PAIRS_OVERLAP: + num_pairs = cugraph_type_erased_device_array_view_size(similarity_coefficient); + break; + case ALL_PAIRS_COSINE: + num_pairs = cugraph_type_erased_device_array_view_size(similarity_coefficient); + break; + } + weight_t h_similarity_coefficient[num_pairs]; ret_code = cugraph_type_erased_device_array_view_copy_to_host( @@ -131,15 +197,18 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, int test_jaccard(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; - - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 10; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; + vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + vertex_t* h_start_vertices = NULL; weight_t h_result[] = {0.2, 0.666667, 0.333333, 0.4, 0.166667, 0.5, 0.2, 0.25, 0.25, 0.666667}; return generic_similarity_test(handle, @@ -148,10 +217,13 @@ int test_jaccard(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, FALSE, FALSE, JACCARD); @@ -159,18 +231,21 @@ int test_jaccard(const cugraph_resource_handle_t* handle) int test_weighted_jaccard(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 7; - size_t num_pairs = 3; + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 3; + size_t num_start_vertices = 0; + size_t topk = 0; vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; weight_t h_wgt[] = { 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; - vertex_t h_first[] = {0, 0, 1}; - vertex_t h_second[] = {1, 2, 3}; - weight_t h_result[] = {0.357143, 0.208333, 0.0}; + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.357143, 0.208333, 0.0}; return generic_similarity_test(handle, h_src, @@ -178,26 +253,137 @@ int test_weighted_jaccard(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, FALSE, TRUE, JACCARD); } +int test_all_pairs_jaccard(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t 
num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.2, 0.25, 0.666667, 0.333333, 0.2, 0.4, 0.166667, 0.5, + 0.25, 0.4, 0.2, 0.25, 0.25, 0.666667, 0.166667, 0.2, + 0.666667, 0.3333333, 0.25, 0.666667, 0.5, 0.25}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_JACCARD); +} + +int test_all_pairs_jaccard_with_start_vertices(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t h_start_vertices[] = {0, 1, 2}; + weight_t h_result[] = { + 0.2, 0.25, 0.666667, 0.333333, 0.2, 0.4, 0.166667, 0.5, 0.25, 0.4, 0.2, 0.25, 0.25}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_JACCARD); +} + +int test_all_pairs_jaccard_with_topk(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 5; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.666667, 0.666667, 0.666667, 0.666667, 0.5}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_JACCARD); +} + int test_sorensen(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; - - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 10; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; + vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + vertex_t* h_start_vertices = NULL; weight_t h_result[] = {0.333333, 0.8, 0.5, 0.571429, 0.285714, 0.666667, 0.333333, 0.4, 0.4, 
0.8}; return generic_similarity_test(handle, @@ -206,10 +392,13 @@ int test_sorensen(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, FALSE, FALSE, SORENSEN); @@ -217,18 +406,21 @@ int test_sorensen(const cugraph_resource_handle_t* handle) int test_weighted_sorensen(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 7; - size_t num_pairs = 3; + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 3; + size_t num_start_vertices = 0; + size_t topk = 0; vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; weight_t h_wgt[] = { 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; - vertex_t h_first[] = {0, 0, 1}; - vertex_t h_second[] = {1, 2, 3}; - weight_t h_result[] = {0.526316, 0.344828, 0.000000}; + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.526316, 0.344828, 0.000000}; return generic_similarity_test(handle, h_src, @@ -236,27 +428,81 @@ int test_weighted_sorensen(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, FALSE, TRUE, SORENSEN); } -int test_overlap(const cugraph_resource_handle_t* handle) +int test_all_pairs_sorensen(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.333333, 0.4, 0.8, 0.5, 0.333333, 0.571429, 0.285714, 0.666667, + 0.4, 0.571429, 0.333333, 0.4, 0.4, 0.8, 0.285714, 0.333333, + 0.8, 0.5, 0.4, 0.8, 0.666667, 0.4}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_SORENSEN); +} - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0.5, 1, 0.5, 0.666667, 0.333333, 1, 0.333333, 0.5, 0.5, 1}; +int test_all_pairs_sorensen_with_start_vertices(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t h_start_vertices[] = {0, 1, 2}; + weight_t h_result[] = {0.333333, + 0.4, + 0.8, + 0.5, + 0.333333, + 0.571429, + 0.285714, + 
0.666667, + 0.4, + 0.571429, + 0.333333, + 0.4, + 0.4}; return generic_similarity_test(handle, h_src, @@ -264,10 +510,81 @@ int test_overlap(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_SORENSEN); +} + +int test_all_pairs_sorensen_with_topk(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 5; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.8, 0.8, 0.8, 0.8, 0.666667}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_SORENSEN); +} + +int test_overlap(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 10; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; + vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.5, 1, 0.5, 0.666667, 0.333333, 1, 0.333333, 0.5, 0.5, 1}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, FALSE, FALSE, OVERLAP); @@ -275,18 +592,21 @@ int test_overlap(const cugraph_resource_handle_t* handle) int test_weighted_overlap(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 7; - size_t num_pairs = 3; + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 3; + size_t num_start_vertices = 0; + size_t topk = 0; vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; weight_t h_wgt[] = { 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; - vertex_t h_first[] = {0, 0, 1}; - vertex_t h_second[] = {1, 2, 3}; - weight_t h_result[] = {0.714286, 0.416667, 0.000000}; + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.714286, 0.416667, 0.000000}; return generic_similarity_test(handle, h_src, @@ -294,15 +614,298 @@ int test_weighted_overlap(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, FALSE, TRUE, OVERLAP); } +int test_all_pairs_overlap(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] 
= {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, + 0.5, 0.666667, 0.333333, 0.5, 0.5, 1.0, 0.333333, 0.333333, + 1.0, 0.5, 0.5, 1.0, 1.0, 0.5}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_OVERLAP); +} + +int test_all_pairs_overlap_with_start_vertices(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t h_start_vertices[] = {0, 1, 2}; + weight_t h_result[] = { + 0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, 0.5, 0.666667, 0.333333, 0.5, 0.5}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_OVERLAP); +} + +int test_all_pairs_overlap_with_topk(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 5; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {1.0, 1.0, 1.0, 1.0, 1.0}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_OVERLAP); +} + +int test_cosine(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 10; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; + vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + COSINE); +} + +int test_weighted_cosine(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 2; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = { + 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 
3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0}; + vertex_t h_second[] = {1, 2}; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.990830, 0.976187}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + TRUE, + COSINE); +} + +int test_all_pairs_cosine(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, + 0.5, 0.666667, 0.333333, 0.5, 0.5, 1.0, 0.333333, 0.333333, + 1.0, 0.5, 0.5, 1.0, 1.0, 0.5}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_COSINE); +} + +int test_all_pairs_cosine_with_start_vertices(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t h_start_vertices[] = {0, 1, 2}; + weight_t h_result[] = { + 0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, 0.5, 0.666667, 0.333333, 0.5, 0.5}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_COSINE); +} + +int test_all_pairs_cosine_with_topk(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 5; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {1.0, 1.0, 1.0, 1.0, 1.0}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_COSINE); +} + /******************************************************************************/ int main(int argc, char** argv) @@ -311,12 +914,30 @@ int main(int argc, char** argv) cugraph_resource_handle_t* handle = cugraph_create_resource_handle(raft_handle); int result = 0; + result |= RUN_MG_TEST(test_jaccard, handle); + result |= RUN_MG_TEST(test_weighted_jaccard, handle); + result |= RUN_MG_TEST(test_all_pairs_jaccard, handle); + result |= RUN_MG_TEST(test_all_pairs_jaccard_with_start_vertices, handle); + result |= 
RUN_MG_TEST(test_all_pairs_jaccard_with_topk, handle); + result |= RUN_MG_TEST(test_sorensen, handle); + result |= RUN_MG_TEST(test_weighted_sorensen, handle); + result |= RUN_MG_TEST(test_all_pairs_sorensen, handle); + result |= RUN_MG_TEST(test_all_pairs_sorensen_with_start_vertices, handle); + result |= RUN_MG_TEST(test_all_pairs_sorensen_with_topk, handle); + result |= RUN_MG_TEST(test_overlap, handle); - // result |= RUN_MG_TEST(test_weighted_jaccard, handle); - // result |= RUN_MG_TEST(test_weighted_sorensen, handle); - // result |= RUN_MG_TEST(test_weighted_overlap, handle); + result |= RUN_MG_TEST(test_weighted_overlap, handle); + result |= RUN_MG_TEST(test_all_pairs_overlap, handle); + result |= RUN_MG_TEST(test_all_pairs_overlap_with_start_vertices, handle); + result |= RUN_MG_TEST(test_all_pairs_overlap_with_topk, handle); + + result |= RUN_MG_TEST(test_cosine, handle); + result |= RUN_MG_TEST(test_weighted_cosine, handle); + result |= RUN_MG_TEST(test_all_pairs_cosine, handle); + result |= RUN_MG_TEST(test_all_pairs_cosine_with_start_vertices, handle); + result |= RUN_MG_TEST(test_all_pairs_cosine_with_topk, handle); cugraph_free_resource_handle(handle); free_mg_raft_handle(raft_handle); diff --git a/cpp/tests/c_api/similarity_test.c b/cpp/tests/c_api/similarity_test.c index c29af658ce9..70e0cb6fb95 100644 --- a/cpp/tests/c_api/similarity_test.c +++ b/cpp/tests/c_api/similarity_test.c @@ -26,7 +26,7 @@ typedef int32_t vertex_t; typedef int32_t edge_t; typedef float weight_t; -typedef enum { JACCARD, SORENSEN, OVERLAP } similarity_t; +typedef enum { JACCARD, SORENSEN, OVERLAP, COSINE } similarity_t; int generic_similarity_test(vertex_t* h_src, vertex_t* h_dst, @@ -101,6 +101,10 @@ int generic_similarity_test(vertex_t* h_src, ret_code = cugraph_overlap_coefficients( handle, graph, vertex_pairs, use_weight, FALSE, &result, &ret_error); break; + case COSINE: + ret_code = cugraph_cosine_similarity_coefficients( + handle, graph, vertex_pairs, use_weight, FALSE, &result, &ret_error); + break; } TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); @@ -179,6 +183,10 @@ int generic_all_pairs_similarity_test(vertex_t* h_src, ret_code = cugraph_all_pairs_overlap_coefficients( handle, graph, vertices_view, use_weight, topk, FALSE, &result, &ret_error); break; + case COSINE: + ret_code = cugraph_all_pairs_cosine_similarity_coefficients( + handle, graph, vertices_view, use_weight, topk, FALSE, &result, &ret_error); + break; } TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); @@ -333,7 +341,7 @@ int test_weighted_sorensen() vertex_t h_first[] = {0, 0, 1}; vertex_t h_second[] = {1, 2, 3}; - weight_t h_result[] = {0.526316, 0.344828, 0.000000}; + weight_t h_result[] = {0.526316, 0.344828, 0.0}; return generic_similarity_test(h_src, h_dst, @@ -389,7 +397,7 @@ int test_weighted_overlap() vertex_t h_first[] = {0, 0, 1}; vertex_t h_second[] = {1, 2, 3}; - weight_t h_result[] = {0.714286, 0.416667, 0.000000}; + weight_t h_result[] = {0.714286, 0.416667, 0.0}; return generic_similarity_test(h_src, h_dst, @@ -405,6 +413,62 @@ int test_weighted_overlap() OVERLAP); } +int test_cosine() +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 10; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 
3}; + vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + weight_t h_result[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + + return generic_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + FALSE, + COSINE); +} + +int test_weighted_cosine() +{ + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 2; + + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = { + 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0}; + vertex_t h_second[] = {1, 2}; + weight_t h_result[] = {0.990830, 0.976187}; + + return generic_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + TRUE, + COSINE); +} + int test_all_pairs_jaccard() { size_t num_edges = 16; @@ -631,6 +695,67 @@ int test_weighted_all_pairs_overlap() OVERLAP); } +int test_all_pairs_cosine() +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 22; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5}; + vertex_t h_second[] = {1, 2, 3, 4, 0, 2, 3, 5, 0, 1, 3, 4, 5, 0, 1, 2, 4, 0, 2, 3, 1, 2}; + weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, + 0.5, 0.666667, 0.333333, 0.5, 0.5, 1.0, 0.333333, 0.333333, + 1.0, 0.5, 0.5, 1.0, 1.0, 0.5}; + + return generic_all_pairs_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + FALSE, + SIZE_MAX, + COSINE); +} + +int test_weighted_all_pairs_cosine_topk() +{ + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 6; + size_t topk = 6; + + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = { + 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 1, 1, 2, 3, 4}; + vertex_t h_second[] = {1, 0, 2, 1, 4, 3}; + weight_t h_result[] = {0.0, 0.0, 1.0, 1.0, 1.0, 1.0}; + + return generic_all_pairs_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + TRUE, + topk, + COSINE); +} + int test_all_pairs_jaccard_topk() { size_t num_edges = 16; @@ -812,28 +937,110 @@ int test_weighted_all_pairs_overlap_topk() OVERLAP); } +int test_all_pairs_cosine_topk() +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t topk = 6; + size_t num_pairs = 6; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 1, 3, 3, 4, 5}; + vertex_t h_second[] = {3, 5, 0, 4, 3, 1}; + weight_t h_result[] = {1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000}; + + return generic_all_pairs_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + FALSE, + topk, + COSINE); +} + +int test_weighted_all_pairs_cosine() +{ + 
size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 16; + + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = { + 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6}; + vertex_t h_second[] = {1, 2, 0, 2, 0, 1, 4, 5, 6, 3, 5, 6, 3, 4, 3, 4}; + weight_t h_result[] = {0.714286, + 0.416667, + 0.714286, + 1, + 0.416667, + 1, + 1, + 0.166667, + 0.5, + 1, + 0.571429, + 0.75, + 0.166667, + 0.571429, + 0.5, + 0.75}; + + return generic_all_pairs_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + TRUE, + SIZE_MAX, + COSINE); +} + /******************************************************************************/ int main(int argc, char** argv) { int result = 0; + result |= RUN_TEST(test_jaccard); result |= RUN_TEST(test_sorensen); result |= RUN_TEST(test_overlap); + result |= RUN_TEST(test_cosine); result |= RUN_TEST(test_weighted_jaccard); result |= RUN_TEST(test_weighted_sorensen); result |= RUN_TEST(test_weighted_overlap); + result |= RUN_TEST(test_weighted_cosine); result |= RUN_TEST(test_all_pairs_jaccard); result |= RUN_TEST(test_all_pairs_sorensen); result |= RUN_TEST(test_all_pairs_overlap); + result |= RUN_TEST(test_all_pairs_cosine); result |= RUN_TEST(test_weighted_all_pairs_jaccard); result |= RUN_TEST(test_weighted_all_pairs_sorensen); result |= RUN_TEST(test_weighted_all_pairs_overlap); + result |= RUN_TEST(test_weighted_all_pairs_cosine); result |= RUN_TEST(test_all_pairs_jaccard_topk); result |= RUN_TEST(test_all_pairs_sorensen_topk); result |= RUN_TEST(test_all_pairs_overlap_topk); + result |= RUN_TEST(test_all_pairs_cosine_topk); result |= RUN_TEST(test_weighted_all_pairs_jaccard_topk); result |= RUN_TEST(test_weighted_all_pairs_sorensen_topk); result |= RUN_TEST(test_weighted_all_pairs_overlap_topk); + result |= RUN_TEST(test_weighted_all_pairs_cosine_topk); + return result; } diff --git a/python/cugraph/cugraph/__init__.py b/python/cugraph/cugraph/__init__.py index ba7e23df800..ada1fec74cb 100644 --- a/python/cugraph/cugraph/__init__.py +++ b/python/cugraph/cugraph/__init__.py @@ -76,10 +76,16 @@ from cugraph.link_prediction import ( jaccard, jaccard_coefficient, + all_pairs_jaccard, overlap, overlap_coefficient, + all_pairs_overlap, sorensen, sorensen_coefficient, + all_pairs_sorensen, + cosine, + cosine_coefficient, + all_pairs_cosine, ) from cugraph.traversal import ( diff --git a/python/cugraph/cugraph/dask/__init__.py b/python/cugraph/cugraph/dask/__init__.py index a76f1460575..6d86982142b 100644 --- a/python/cugraph/cugraph/dask/__init__.py +++ b/python/cugraph/cugraph/dask/__init__.py @@ -33,8 +33,13 @@ from .centrality.betweenness_centrality import edge_betweenness_centrality from .cores.k_core import k_core from .link_prediction.jaccard import jaccard +from .link_prediction.jaccard import all_pairs_jaccard from .link_prediction.sorensen import sorensen +from .link_prediction.sorensen import all_pairs_sorensen from .link_prediction.overlap import overlap +from .link_prediction.overlap import all_pairs_overlap +from .link_prediction.cosine import cosine +from .link_prediction.cosine import all_pairs_cosine from .community.leiden import leiden # Avoid "p2p" shuffling in dask for now diff --git a/python/cugraph/cugraph/dask/link_prediction/cosine.py 
b/python/cugraph/cugraph/dask/link_prediction/cosine.py new file mode 100644 index 00000000000..e4007ad96d5 --- /dev/null +++ b/python/cugraph/cugraph/dask/link_prediction/cosine.py @@ -0,0 +1,323 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from dask.distributed import wait, default_client +import cugraph.dask.comms.comms as Comms +import dask_cudf +import cudf +from cugraph.dask.common.input_utils import get_distributed_data +from cugraph.dask import get_n_workers +from cugraph.utilities import renumber_vertex_pair +from cugraph.dask.common.part_utils import ( + get_persisted_df_worker_map, + persist_dask_df_equal_parts_per_worker, +) + + +from pylibcugraph import ( + cosine_coefficients as pylibcugraph_cosine_coefficients, + all_pairs_cosine_coefficients as pylibcugraph_all_pairs_cosine_coefficients, +) +from pylibcugraph import ResourceHandle + + +def convert_to_cudf(cp_arrays): + """ + Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper + """ + + cupy_first, cupy_second, cupy_similarity = cp_arrays + + df = cudf.DataFrame() + df["first"] = cupy_first + df["second"] = cupy_second + df["cosine_coeff"] = cupy_similarity + + return df + + +def _call_plc_all_pairs_cosine( + sID, mg_graph_x, vertices, use_weight, topk, do_expensive_check +): + + return pylibcugraph_all_pairs_cosine_coefficients( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + vertices=vertices, + use_weight=use_weight, + topk=topk, + do_expensive_check=do_expensive_check, + ) + + +def _call_plc_cosine( + sID, mg_graph_x, vertex_pair, use_weight, do_expensive_check, vertex_pair_col_name +): + + first = vertex_pair[vertex_pair_col_name[0]] + second = vertex_pair[vertex_pair_col_name[1]] + + return pylibcugraph_cosine_coefficients( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + first=first, + second=second, + use_weight=use_weight, + do_expensive_check=do_expensive_check, + ) + + +def cosine(input_graph, vertex_pair=None, use_weight=False): + """ + Compute the Cosine similarity between each pair of vertices connected by + an edge, or between arbitrary pairs of vertices specified by the user. + Cosine similarity is defined between two sets as the ratio of their + intersection's volume over the square root of volume's product. + In the context of graphs, the neighborhood of a vertex is seen as a set. + The Cosine similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. + + cugraph.dask.cosine, in the absence of a specified vertex pair list, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the cosine coefficient for those vertex pairs. This is + not advisable as the vertex_pairs can grow exponentially with respect to the + size of the datasets. 
+ + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list (edge weights are not supported yet for this algorithm). The + graph should be undirected where an undirected edge is represented by a + directed edge in both direction. The adjacency list will be computed if + not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertex_pair : cudf.DataFrame, optional (default=None) + A GPU dataframe consisting of two columns representing pairs of + vertices. If provided, the cosine coefficient is computed for the + given vertex pairs. If the vertex_pair is not provided then the + current implementation computes the cosine coefficient for all + adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted cosine (if use_weight==True) + or un-weighted cosine (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + Returns + ------- + result : dask_cudf.DataFrame + GPU distributed data frame containing 3 dask_cudf.Series + + ddf['first']: dask_cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + ddf['second']: dask_cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + ddf['cosine_coeff']: dask_cudf.Series + The computed cosine coefficient between the first and the second + vertex ID. + """ + + if input_graph.is_directed(): + raise ValueError("input graph must be undirected") + + if vertex_pair is None: + # Call two_hop neighbor of the entire graph + vertex_pair = input_graph.get_two_hop_neighbors() + + vertex_pair_col_name = vertex_pair.columns + + if isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)): + vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + + elif vertex_pair is not None: + raise ValueError("vertex_pair must be a dask_cudf or cudf dataframe") + + if not isinstance(vertex_pair, (dask_cudf.DataFrame)): + vertex_pair = dask_cudf.from_cudf( + vertex_pair, npartitions=len(Comms.get_workers()) + ) + vertex_pair = get_distributed_data(vertex_pair) + wait(vertex_pair) + vertex_pair = vertex_pair.worker_to_parts + + # Initialize dask client + client = default_client() + + do_expensive_check = False + + result = [ + client.submit( + _call_plc_cosine, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertex_pair[w][0], + use_weight, + do_expensive_check, + vertex_pair_col_name, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] + + wait(result) + + cudf_result = [client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result] + + wait(cudf_result) + + ddf = dask_cudf.from_delayed(cudf_result).persist() + wait(ddf) + + # Wait until the inactive futures are released + wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)]) + + if input_graph.renumbered: + ddf = input_graph.unrenumber(ddf, "first") + ddf = input_graph.unrenumber(ddf, "second") + + return ddf + + +def all_pairs_cosine( + input_graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None, +): + """ + Compute the All Pairs Cosine similarity between all pairs of vertices specified. + All pairs Cosine similarity is defined between two sets as the ratio of their + intersection's volume over the square root of their volume's product. + In the context of graphs, the neighborhood of a vertex is seen as a set. 
The Cosine + similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. + + cugraph.all_pairs_cosine, in the absence of specified vertices, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the cosine coefficient for all the vertex pairs in the graph. + This is not advisable as the vertex_pairs can grow exponentially with respect to + the size of the datasets. + + If the topk parameter is specified then the result will only contain the top k + highest scoring results. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list (edge weights are not supported yet for this algorithm). The + graph should be undirected where an undirected edge is represented by a + directed edge in both direction. The adjacency list will be computed if + not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertices : int or list or cudf.Series, dask_cudf.Series, optional (default=None) + A GPU Series containing the input vertex list. If the vertex list is not + provided then the current implementation computes the cosine coefficient for + all adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted cosine (if use_weight==True) + or un-weighted cosine (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + topk : int, optional (default=None) + Specify the number of answers to return otherwise returns the entire + solution + + Returns + ------- + result : dask_cudf.DataFrame + GPU distributed data frame containing 3 dask_cudf.Series + + ddf['first']: dask_cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + ddf['second']: dask_cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + ddf['cosine_coeff']: dask_cudf.Series + The computed cosine coefficient between the first and the second + vertex ID. 
+ """ + + if input_graph.is_directed(): + raise ValueError("input graph must be undirected") + + # Initialize dask client + client = default_client() + + if vertices is not None: + if isinstance(vertices, int): + vertices = [vertices] + + if isinstance(vertices, list): + vertices = cudf.Series( + vertices, + dtype=input_graph.edgelist.edgelist_df[ + input_graph.renumber_map.renumbered_src_col_name + ].dtype, + ) + + if not isinstance(vertices, (dask_cudf.Series)): + vertices = dask_cudf.from_cudf(vertices, npartitions=get_n_workers()) + + if input_graph.renumbered: + vertices = input_graph.lookup_internal_vertex_id(vertices) + + n_workers = get_n_workers() + vertices = vertices.repartition(npartitions=n_workers) + vertices = persist_dask_df_equal_parts_per_worker(vertices, client) + vertices = get_persisted_df_worker_map(vertices, client) + + do_expensive_check = False + + result = [ + client.submit( + _call_plc_all_pairs_cosine, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertices[w][0] if vertices is not None else None, + use_weight, + topk, + do_expensive_check, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] + + wait(result) + + cudf_result = [client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result] + + wait(cudf_result) + + ddf = dask_cudf.from_delayed(cudf_result).persist() + wait(ddf) + + # Wait until the inactive futures are released + wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)]) + + if input_graph.renumbered: + ddf = input_graph.unrenumber(ddf, "first") + ddf = input_graph.unrenumber(ddf, "second") + + return ddf diff --git a/python/cugraph/cugraph/dask/link_prediction/jaccard.py b/python/cugraph/cugraph/dask/link_prediction/jaccard.py index 3b8edc8daa5..f72122048f9 100644 --- a/python/cugraph/cugraph/dask/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/dask/link_prediction/jaccard.py @@ -18,10 +18,17 @@ import dask_cudf import cudf from cugraph.dask.common.input_utils import get_distributed_data +from cugraph.dask import get_n_workers from cugraph.utilities import renumber_vertex_pair +from cugraph.dask.common.part_utils import ( + get_persisted_df_worker_map, + persist_dask_df_equal_parts_per_worker, +) + from pylibcugraph import ( jaccard_coefficients as pylibcugraph_jaccard_coefficients, + all_pairs_jaccard_coefficients as pylibcugraph_all_pairs_jaccard_coefficients, ) from pylibcugraph import ResourceHandle @@ -41,6 +48,20 @@ def convert_to_cudf(cp_arrays): return df +def _call_plc_all_pairs_jaccard( + sID, mg_graph_x, vertices, use_weight, topk, do_expensive_check +): + + return pylibcugraph_all_pairs_jaccard_coefficients( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + vertices=vertices, + use_weight=use_weight, + topk=topk, + do_expensive_check=do_expensive_check, + ) + + def _call_plc_jaccard( sID, mg_graph_x, vertex_pair, use_weight, do_expensive_check, vertex_pair_col_name ): @@ -63,7 +84,7 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): Compute the Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. Jaccard similarity is defined between two sets as the ratio of the volume - of their intersection divided by the volume of their union. In the context + of their intersection over the volume of their union. In the context of graphs, the neighborhood of a vertex is seen as a set. 
The Jaccard similarity weight of each edge represents the strength of connection between vertices based on the relative similarity of their neighbors. @@ -83,7 +104,7 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): directed edge in both direction. The adjacency list will be computed if not already present. - This implementation only supports undirected, unweighted Graph. + This implementation only supports undirected, non-multi Graphs. vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of @@ -100,7 +121,7 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): Returns ------- result : dask_cudf.DataFrame - GPU distributed data frame containing 2 dask_cudf.Series + GPU distributed data frame containing 3 dask_cudf.Series ddf['first']: dask_cudf.Series The first vertex ID of each pair (will be identical to first if specified). @@ -140,21 +161,148 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): do_expensive_check = False - if vertex_pair is not None: - result = [ - client.submit( - _call_plc_jaccard, - Comms.get_session_id(), - input_graph._plc_graph[w], - vertex_pair[w][0], - use_weight, - do_expensive_check, - vertex_pair_col_name, - workers=[w], - allow_other_workers=False, + result = [ + client.submit( + _call_plc_jaccard, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertex_pair[w][0], + use_weight, + do_expensive_check, + vertex_pair_col_name, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] + + wait(result) + + cudf_result = [client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result] + + wait(cudf_result) + + ddf = dask_cudf.from_delayed(cudf_result).persist() + wait(ddf) + + # Wait until the inactive futures are released + wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)]) + + if input_graph.renumbered: + ddf = input_graph.unrenumber(ddf, "first") + ddf = input_graph.unrenumber(ddf, "second") + + return ddf + + +def all_pairs_jaccard( + input_graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None, +): + """ + Compute the All Pairs Jaccard similarity between all pairs of vertices specified. + All pairs Jaccard similarity is defined between two sets as the ratio of the volume + of their intersection over the volume of their union. In the context + of graphs, the neighborhood of a vertex is seen as a set. The Jaccard + similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. + + cugraph.all_pairs_jaccard, in the absence of specified vertices, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the jaccard coefficient for all the vertex pairs in the graph. + This is not advisable as the vertex_pairs can grow exponentially with respect to + the size of the datasets. + + If the topk parameter is specified then the result will only contain the top k + highest scoring results. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list (edge weights are not supported yet for this algorithm). The + graph should be undirected where an undirected edge is represented by a + directed edge in both direction. The adjacency list will be computed if + not already present. + + This implementation only supports undirected, non-multi Graphs. 
+ + vertices : int or list or cudf.Series, dask_cudf.Series, optional (default=None) + A GPU Series containing the input vertex list. If the vertex list is not + provided then the current implementation computes the jaccard coefficient for + all adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted jaccard (if use_weight==True) + or un-weighted jaccard (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + topk : int, optional (default=None) + Specify the number of answers to return otherwise returns the entire + solution + + Returns + ------- + result : dask_cudf.DataFrame + GPU distributed data frame containing 3 dask_cudf.Series + + ddf['first']: dask_cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + ddf['second']: dask_cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + ddf['jaccard_coeff']: dask_cudf.Series + The computed jaccard coefficient between the first and the second + vertex ID. + """ + + if input_graph.is_directed(): + raise ValueError("input graph must be undirected") + + # Initialize dask client + client = default_client() + + if vertices is not None: + if isinstance(vertices, int): + vertices = [vertices] + + if isinstance(vertices, list): + vertices = cudf.Series( + vertices, + dtype=input_graph.edgelist.edgelist_df[ + input_graph.renumber_map.renumbered_src_col_name + ].dtype, ) - for w in Comms.get_workers() - ] + + if not isinstance(vertices, (dask_cudf.Series)): + vertices = dask_cudf.from_cudf(vertices, npartitions=get_n_workers()) + + if input_graph.renumbered: + vertices = input_graph.lookup_internal_vertex_id(vertices) + + n_workers = get_n_workers() + vertices = vertices.repartition(npartitions=n_workers) + vertices = persist_dask_df_equal_parts_per_worker(vertices, client) + vertices = get_persisted_df_worker_map(vertices, client) + + do_expensive_check = False + + result = [ + client.submit( + _call_plc_all_pairs_jaccard, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertices[w][0] if vertices is not None else None, + use_weight, + topk, + do_expensive_check, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] wait(result) diff --git a/python/cugraph/cugraph/dask/link_prediction/overlap.py b/python/cugraph/cugraph/dask/link_prediction/overlap.py index 4bda05e3c95..e1a3285ee60 100644 --- a/python/cugraph/cugraph/dask/link_prediction/overlap.py +++ b/python/cugraph/cugraph/dask/link_prediction/overlap.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
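For reference, a minimal multi-GPU usage sketch for the `all_pairs_jaccard` wrapper introduced above. It assumes a dask_cuda cluster is already running, `cugraph.dask.comms` has been initialized, and the new function is re-exported as `cugraph.dask.all_pairs_jaccard`; the CSV path and column names below are placeholders, not part of this PR.

    import cugraph
    import cugraph.dask as dcg
    import dask_cudf

    # Build a distributed, undirected graph from a placeholder edge list.
    ddf = dask_cudf.read_csv(
        "edges.csv",  # placeholder path
        delimiter=" ",
        names=["src", "dst", "wgt"],
        dtype=["int32", "int32", "float32"],
    )
    G = cugraph.Graph(directed=False)
    G.from_dask_cudf_edgelist(ddf, source="src", destination="dst", edge_attr="wgt")

    # Top 10 highest-scoring pairs among the two-hop neighbors of vertices 0-2,
    # using unweighted Jaccard (use_weight defaults to False).
    scores = dcg.all_pairs_jaccard(G, vertices=[0, 1, 2], topk=10)
    print(scores.compute().head())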
@@ -19,9 +19,15 @@ import cudf from cugraph.dask.common.input_utils import get_distributed_data from cugraph.utilities import renumber_vertex_pair +from cugraph.dask import get_n_workers +from cugraph.dask.common.part_utils import ( + get_persisted_df_worker_map, + persist_dask_df_equal_parts_per_worker, +) from pylibcugraph import ( overlap_coefficients as pylibcugraph_overlap_coefficients, + all_pairs_overlap_coefficients as pylibcugraph_all_pairs_overlap_coefficients, ) from pylibcugraph import ResourceHandle @@ -41,6 +47,20 @@ def convert_to_cudf(cp_arrays): return df +def _call_plc_all_pairs_overlap( + sID, mg_graph_x, vertices, use_weight, topk, do_expensive_check +): + + return pylibcugraph_all_pairs_overlap_coefficients( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + vertices=vertices, + use_weight=use_weight, + topk=topk, + do_expensive_check=do_expensive_check, + ) + + def _call_plc_overlap( sID, mg_graph_x, vertex_pair, use_weight, do_expensive_check, vertex_pair_col_name ): @@ -63,7 +83,7 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): Compute the Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. Overlap Coefficient is defined between two sets as the ratio of the volume - of their intersection divided by the smaller of their two volumes. In the + of their intersection over the smaller of their two volumes. In the context of graphs, the neighborhood of a vertex is seen as a set. The Overlap Coefficient weight of each edge represents the strength of connection between vertices based on the relative similarity of their @@ -86,7 +106,7 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): directed edge in both direction. The adjacency list will be computed if not already present. - This implementation only supports undirected, unweighted Graph. + This implementation only supports undirected, non-multi Graphs. vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of @@ -103,7 +123,7 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): Returns ------- result : dask_cudf.DataFrame - GPU distributed data frame containing 2 dask_cudf.Series + GPU distributed data frame containing 3 dask_cudf.Series ddf['first']: dask_cudf.Series The first vertex ID of each pair(will be identical to first if specified). 
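In the unweighted case, the overlap coefficient described in the docstring above reduces to |N(u) ∩ N(v)| / min(|N(u)|, |N(v)|) over neighbor sets. A small pure-Python illustration of that definition (a sketch only, not the GPU code path used by the library):

    def overlap_coefficient(neighbors_u: set, neighbors_v: set) -> float:
        """Unweighted overlap: |N(u) & N(v)| / min(|N(u)|, |N(v)|)."""
        if not neighbors_u or not neighbors_v:
            return 0.0
        return len(neighbors_u & neighbors_v) / min(len(neighbors_u), len(neighbors_v))

    # Vertices u and v share two of their neighbors: 2 / min(3, 4) ~= 0.667
    print(overlap_coefficient({1, 2, 3}, {2, 3, 4, 5}))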
@@ -143,21 +163,148 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): do_expensive_check = False - if vertex_pair is not None: - result = [ - client.submit( - _call_plc_overlap, - Comms.get_session_id(), - input_graph._plc_graph[w], - vertex_pair[w][0], - use_weight, - do_expensive_check, - vertex_pair_col_name, - workers=[w], - allow_other_workers=False, + result = [ + client.submit( + _call_plc_overlap, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertex_pair[w][0], + use_weight, + do_expensive_check, + vertex_pair_col_name, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] + + wait(result) + + cudf_result = [client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result] + + wait(cudf_result) + + ddf = dask_cudf.from_delayed(cudf_result).persist() + wait(ddf) + + # Wait until the inactive futures are released + wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)]) + + if input_graph.renumbered: + ddf = input_graph.unrenumber(ddf, "first") + ddf = input_graph.unrenumber(ddf, "second") + + return ddf + + +def all_pairs_overlap( + input_graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None, +): + """ + Compute the All Pairs Overlap similarity between all pairs of vertices specified. + All pairs Overlap Coefficient is defined between two sets as the ratio of the volume + of their intersection over the smaller of their two volumes. In the context + of graphs, the neighborhood of a vertex is seen as a set. The Overlap + similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. + + cugraph.all_pairs_overlap, in the absence of specified vertices, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the overlap coefficient for all the vertex pairs in the graph. + This is not advisable as the vertex_pairs can grow exponentially with respect to + the size of the datasets. + + If the topk parameter is specified then the result will only contain the top k + highest scoring results. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list (edge weights are not supported yet for this algorithm). The + graph should be undirected where an undirected edge is represented by a + directed edge in both direction. The adjacency list will be computed if + not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertices : int or list or cudf.Series, dask_cudf.Series, optional (default=None) + A GPU Series containing the input vertex list. If the vertex list is not + provided then the current implementation computes the overlap coefficient for + all adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted overlap (if use_weight==True) + or un-weighted overlap (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + topk : int, optional (default=None) + Specify the number of answers to return otherwise returns the entire + solution + + Returns + ------- + result : dask_cudf.DataFrame + GPU distributed data frame containing 3 dask_cudf.Series + + ddf['first']: dask_cudf.Series + The first vertex ID of each pair (will be identical to first if specified). 
+ ddf['second']: dask_cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + ddf['overlap_coeff']: dask_cudf.Series + The computed overlap coefficient between the first and the second + vertex ID. + """ + + if input_graph.is_directed(): + raise ValueError("input graph must be undirected") + + # Initialize dask client + client = default_client() + + if vertices is not None: + if isinstance(vertices, int): + vertices = [vertices] + + if isinstance(vertices, list): + vertices = cudf.Series( + vertices, + dtype=input_graph.edgelist.edgelist_df[ + input_graph.renumber_map.renumbered_src_col_name + ].dtype, ) - for w in Comms.get_workers() - ] + + if not isinstance(vertices, (dask_cudf.Series)): + vertices = dask_cudf.from_cudf(vertices, npartitions=get_n_workers()) + + if input_graph.renumbered: + vertices = input_graph.lookup_internal_vertex_id(vertices) + + n_workers = get_n_workers() + vertices = vertices.repartition(npartitions=n_workers) + vertices = persist_dask_df_equal_parts_per_worker(vertices, client) + vertices = get_persisted_df_worker_map(vertices, client) + + do_expensive_check = False + + result = [ + client.submit( + _call_plc_all_pairs_overlap, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertices[w][0] if vertices is not None else None, + use_weight, + topk, + do_expensive_check, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] wait(result) diff --git a/python/cugraph/cugraph/dask/link_prediction/sorensen.py b/python/cugraph/cugraph/dask/link_prediction/sorensen.py index 163b0d0dc16..3697385e8f8 100644 --- a/python/cugraph/cugraph/dask/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/dask/link_prediction/sorensen.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,10 +18,16 @@ import dask_cudf import cudf from cugraph.dask.common.input_utils import get_distributed_data +from cugraph.dask import get_n_workers from cugraph.utilities import renumber_vertex_pair +from cugraph.dask.common.part_utils import ( + get_persisted_df_worker_map, + persist_dask_df_equal_parts_per_worker, +) from pylibcugraph import ( sorensen_coefficients as pylibcugraph_sorensen_coefficients, + all_pairs_sorensen_coefficients as pylibcugraph_all_pairs_sorensen_coefficients, ) from pylibcugraph import ResourceHandle @@ -58,12 +64,26 @@ def _call_plc_sorensen( ) +def _call_plc_all_pairs_sorensen( + sID, mg_graph_x, vertices, use_weight, topk, do_expensive_check +): + + return pylibcugraph_all_pairs_sorensen_coefficients( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + vertices=vertices, + use_weight=use_weight, + topk=topk, + do_expensive_check=do_expensive_check, + ) + + def sorensen(input_graph, vertex_pair=None, use_weight=False): """ Compute the Sorensen coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. Sorensen coefficient is defined between two sets as the ratio of twice the - volume of their intersection divided by the volume of each set. + volume of their intersection over the volume of each set. If first is specified but second is not, or vice versa, an exception will be thrown. 
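For the unweighted, set-based case, the Sorensen coefficient described above is 2·|N(u) ∩ N(v)| / (|N(u)| + |N(v)|), which relates to Jaccard via S = 2J / (1 + J). A small pure-Python check of that relationship (illustration of the definitions only, not the library's code path):

    def jaccard(a: set, b: set) -> float:
        """Unweighted Jaccard: |A & B| / |A | B|."""
        return len(a & b) / len(a | b) if a | b else 0.0

    def sorensen(a: set, b: set) -> float:
        """Unweighted Sorensen-Dice: 2 * |A & B| / (|A| + |B|)."""
        return 2 * len(a & b) / (len(a) + len(b)) if a or b else 0.0

    u, v = {1, 2, 3}, {2, 3, 4, 5}
    j, s = jaccard(u, v), sorensen(u, v)          # j = 0.4, s ~= 0.571
    assert abs(s - 2 * j / (1 + j)) < 1e-12       # Sorensen = 2J / (1 + J)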
@@ -82,7 +102,7 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False): directed edge in both direction. The adjacency list will be computed if not already present. - This implementation only supports undirected, unweighted Graph. + This implementation only supports undirected, non-multi Graphs. vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of @@ -99,7 +119,7 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False): Returns ------- result : dask_cudf.DataFrame - GPU distributed data frame containing 2 dask_cudf.Series + GPU distributed data frame containing 3 dask_cudf.Series ddf['first']: dask_cudf.Series The first vertex ID of each pair(will be identical to first if specified). @@ -139,21 +159,148 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False): do_expensive_check = False - if vertex_pair is not None: - result = [ - client.submit( - _call_plc_sorensen, - Comms.get_session_id(), - input_graph._plc_graph[w], - vertex_pair[w][0], - use_weight, - do_expensive_check, - vertex_pair_col_name, - workers=[w], - allow_other_workers=False, + result = [ + client.submit( + _call_plc_sorensen, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertex_pair[w][0], + use_weight, + do_expensive_check, + vertex_pair_col_name, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] + + wait(result) + + cudf_result = [client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result] + + wait(cudf_result) + + ddf = dask_cudf.from_delayed(cudf_result).persist() + wait(ddf) + + # Wait until the inactive futures are released + wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)]) + + if input_graph.renumbered: + ddf = input_graph.unrenumber(ddf, "first") + ddf = input_graph.unrenumber(ddf, "second") + + return ddf + + +def all_pairs_sorensen( + input_graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None, +): + """ + Compute the All Pairs Sorensen similarity between all pairs of vertices specified. + All pairs Sorensen coefficient is defined between two sets as the ratio of twice the + volume of their intersection over the volume of each set. In the context + of graphs, the neighborhood of a vertex is seen as a set. The Sorensen + similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. + + cugraph.all_pairs_sorensen, in the absence of specified vertices, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the sorensen coefficient for all the vertex pairs in the graph. + This is not advisable as the vertex_pairs can grow exponentially with respect to + the size of the datasets. + + If the topk parameter is specified then the result will only contain the top k + highest scoring results. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list (edge weights are not supported yet for this algorithm). The + graph should be undirected where an undirected edge is represented by a + directed edge in both direction. The adjacency list will be computed if + not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertices : int or list or cudf.Series, dask_cudf.Series, optional (default=None) + A GPU Series containing the input vertex list. 
If the vertex list is not + provided then the current implementation computes the sorensen coefficient for + all adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted sorensen (if use_weight==True) + or un-weighted sorensen (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + topk : int, optional (default=None) + Specify the number of answers to return otherwise returns the entire + solution + + Returns + ------- + result : dask_cudf.DataFrame + GPU distributed data frame containing 3 dask_cudf.Series + + ddf['first']: dask_cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + ddf['second']: dask_cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + ddf['sorensen_coeff']: dask_cudf.Series + The computed sorensen coefficient between the first and the second + vertex ID. + """ + + if input_graph.is_directed(): + raise ValueError("input graph must be undirected") + + # Initialize dask client + client = default_client() + + if vertices is not None: + if isinstance(vertices, int): + vertices = [vertices] + + if isinstance(vertices, list): + vertices = cudf.Series( + vertices, + dtype=input_graph.edgelist.edgelist_df[ + input_graph.renumber_map.renumbered_src_col_name + ].dtype, ) - for w in Comms.get_workers() - ] + + if not isinstance(vertices, (dask_cudf.Series)): + vertices = dask_cudf.from_cudf(vertices, npartitions=get_n_workers()) + + if input_graph.renumbered: + vertices = input_graph.lookup_internal_vertex_id(vertices) + + n_workers = get_n_workers() + vertices = vertices.repartition(npartitions=n_workers) + vertices = persist_dask_df_equal_parts_per_worker(vertices, client) + vertices = get_persisted_df_worker_map(vertices, client) + + do_expensive_check = False + + result = [ + client.submit( + _call_plc_all_pairs_sorensen, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertices[w][0] if vertices is not None else None, + use_weight, + topk, + do_expensive_check, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] wait(result) diff --git a/python/cugraph/cugraph/link_prediction/__init__.py b/python/cugraph/cugraph/link_prediction/__init__.py index 38c8b9a2d3b..f511b95c34c 100644 --- a/python/cugraph/cugraph/link_prediction/__init__.py +++ b/python/cugraph/cugraph/link_prediction/__init__.py @@ -13,7 +13,13 @@ from cugraph.link_prediction.jaccard import jaccard from cugraph.link_prediction.jaccard import jaccard_coefficient +from cugraph.link_prediction.jaccard import all_pairs_jaccard from cugraph.link_prediction.sorensen import sorensen from cugraph.link_prediction.sorensen import sorensen_coefficient +from cugraph.link_prediction.sorensen import all_pairs_sorensen from cugraph.link_prediction.overlap import overlap from cugraph.link_prediction.overlap import overlap_coefficient +from cugraph.link_prediction.overlap import all_pairs_overlap +from cugraph.link_prediction.cosine import cosine +from cugraph.link_prediction.cosine import cosine_coefficient +from cugraph.link_prediction.cosine import all_pairs_cosine diff --git a/python/cugraph/cugraph/link_prediction/cosine.py b/python/cugraph/cugraph/link_prediction/cosine.py new file mode 100644 index 00000000000..9dce0e96f8c --- /dev/null +++ b/python/cugraph/cugraph/link_prediction/cosine.py @@ -0,0 +1,359 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cugraph.utilities import ( + ensure_cugraph_obj_for_nx, + df_edge_score_to_dictionary, + renumber_vertex_pair, +) +import cudf +import warnings +from typing import Union, Iterable + +from pylibcugraph import ( + cosine_coefficients as pylibcugraph_cosine_coefficients, + all_pairs_cosine_coefficients as pylibcugraph_all_pairs_cosine_coefficients, +) +from pylibcugraph import ResourceHandle + +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + + +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes.iloc[0] + vertex_pair_dtypes = vertex_pair.dtypes + + if ( + vertex_pair_dtypes.iloc[0] != vertex_dtype + or vertex_pair_dtypes.iloc[1] != vertex_dtype + ): + warning_msg = ( + "Cosine requires 'vertex_pair' to match the graph's 'vertex' type. " + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair + + +def cosine( + input_graph: Graph, + vertex_pair: cudf.DataFrame = None, + use_weight: bool = False, +): + """ + Compute the Cosine similarity between each pair of vertices connected by + an edge, or between arbitrary pairs of vertices specified by the user. + The Cosine similarity is defined between two sets as the ratio of their + intersection's volume over the square root of their volume's product. + In the context of graphs, the neighborhood of a vertex is seen as a set. + The Cosine similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. + + cugraph.cosine, in the absence of a specified vertex pair list, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the cosine coefficient for those vertex pairs. This is + not advisable as the vertex_pairs can grow exponentially with respect to the + size of the datasets. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list. The graph should be undirected where an undirected + edge is represented by a directed edge in both direction.The adjacency + list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertex_pair : cudf.DataFrame, optional (default=None) + A GPU dataframe consisting of two columns representing pairs of + vertices. 
If provided, the cosine coefficient is computed for the + given vertex pairs. If the vertex_pair is not provided then the + current implementation computes the cosine coefficient for all + adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted cosine (if use_weight==True) + or un-weighted cosine (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the Cosine weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. + + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + df['cosine_coeff'] : cudf.Series + The computed Cosine coefficient between the first and the second + vertex ID. + + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import cosine + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = cosine(input_graph) + + """ + if input_graph.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertex_pair is None: + # Call two_hop neighbor of the entire graph + vertex_pair = input_graph.get_two_hop_neighbors() + + v_p_num_col = len(vertex_pair.columns) + + if isinstance(vertex_pair, cudf.DataFrame): + vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + vertex_pair = ensure_valid_dtype(input_graph, vertex_pair) + src_col_name = vertex_pair.columns[0] + dst_col_name = vertex_pair.columns[1] + first = vertex_pair[src_col_name] + second = vertex_pair[dst_col_name] + + elif vertex_pair is not None: + raise ValueError("vertex_pair must be a cudf Dataframe") + + first, second, cosine_coeff = pylibcugraph_cosine_coefficients( + resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + first=first, + second=second, + use_weight=use_weight, + do_expensive_check=False, + ) + + if input_graph.renumbered: + vertex_pair = input_graph.unrenumber( + vertex_pair, src_col_name, preserve_order=True + ) + vertex_pair = input_graph.unrenumber( + vertex_pair, dst_col_name, preserve_order=True + ) + + if v_p_num_col == 2: + # single column vertex + vertex_pair = vertex_pair.rename( + columns={src_col_name: "first", dst_col_name: "second"} + ) + + df = vertex_pair + df["cosine_coeff"] = cudf.Series(cosine_coeff) + + return df + + +def cosine_coefficient( + G: Union[Graph, "networkx.Graph"], + ebunch: Union[cudf.DataFrame, Iterable[Union[int, str, float]]] = None, +): + """ + Note: No NetworkX equivalent. + + Parameters + ---------- + G : cugraph.Graph or NetworkX.Graph + cuGraph or NetworkX Graph instance, should contain the connectivity + information as an edge list. The graph should be undirected where an + undirected edge is represented by a directed edge in both direction. + The adjacency list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. + + ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None) + A GPU dataframe consisting of two columns representing pairs of + vertices or iterable of 2-tuples (u, v) where u and v are nodes in + the graph. + + If provided, the Overlap coefficient is computed for the given vertex + pairs. 
Otherwise, the current implementation computes the overlap + coefficient for all adjacent vertices in the graph. + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the Cosine weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. + + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + the second vertex ID of each pair (will be identical to second if + specified). + df['cosine_coeff'] : cudf.Series + The computed Cosine coefficient between the first and the second + vertex ID. + + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import cosine_coefficient + >>> G = karate.get_graph(download=True) + >>> df = cosine_coefficient(G) + + """ + vertex_pair = None + + G, isNx = ensure_cugraph_obj_for_nx(G) + + if isNx is True and ebunch is not None: + vertex_pair = cudf.DataFrame(ebunch) + + df = cosine(G, vertex_pair) + + if isNx is True: + df = df_edge_score_to_dictionary( + df, k="cosine_coeff", src="first", dst="second" + ) + + return df + + +def all_pairs_cosine( + input_graph: Graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None, +): + """ + Compute the All Pairs Cosine similarity between all pairs of vertices specified. + The Cosine similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. + The All Pairs Cosine similarity is defined between two sets as the ratio of their + intersection's volume over the square root of their volume's product. + In the context of graphs, the neighborhood of a vertex is seen as a set. + The Cosine similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. + + cugraph.all_pairs_cosine, in the absence of specified vertices, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the cosine coefficient for all the vertex pairs in the graph. + This is not advisable as the vertex_pairs can grow exponentially with respect to + the size of the datasets. + + If the topk parameter is specified then the result will only contain the top k + highest scoring results. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list. The graph should be undirected where an undirected + edge is represented by a directed edge in both direction.The adjacency + list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertices : int or list or cudf.Series or cudf.DataFrame, optional (default=None) + A GPU Series containing the input vertex list. If the vertex list is not + provided then the current implementation computes the cosine coefficient for + all adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted cosine (if use_weight==True) + or un-weighted cosine (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. 
+ + topk : int, optional (default=None) + Specify the number of answers to return otherwise returns the entire + solution + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the Cosine weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. + + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + df['cosine_coeff'] : cudf.Series + The computed Cosine coefficient between the first and the second + vertex ID. + + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import all_pairs_cosine + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = all_pairs_cosine(input_graph) + + """ + if input_graph.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertices is not None: + + if isinstance(vertices, int): + vertices = [vertices] + + if isinstance(vertices, list): + vertices = cudf.Series( + vertices, + dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype, + ) + + if input_graph.renumbered is True: + if isinstance(vertices, cudf.DataFrame): + vertices = input_graph.lookup_internal_vertex_id( + vertices, vertices.columns + ) + else: + vertices = input_graph.lookup_internal_vertex_id(vertices) + + first, second, cosine_coeff = pylibcugraph_all_pairs_cosine_coefficients( + resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + vertices=vertices, + use_weight=use_weight, + topk=topk, + do_expensive_check=False, + ) + vertex_pair = cudf.DataFrame() + vertex_pair["first"] = first + vertex_pair["second"] = second + + if input_graph.renumbered: + vertex_pair = input_graph.unrenumber(vertex_pair, "first", preserve_order=True) + vertex_pair = input_graph.unrenumber(vertex_pair, "second", preserve_order=True) + + df = vertex_pair + df["cosine_coeff"] = cudf.Series(cosine_coeff) + + return df diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py index 06644a7e1b7..214d92a1be5 100644 --- a/python/cugraph/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/link_prediction/jaccard.py @@ -22,6 +22,7 @@ from pylibcugraph import ( jaccard_coefficients as pylibcugraph_jaccard_coefficients, + all_pairs_jaccard_coefficients as pylibcugraph_all_pairs_jaccard_coefficients, ) from pylibcugraph import ResourceHandle @@ -65,7 +66,7 @@ def jaccard( Compute the Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. Jaccard similarity is defined between two sets as the ratio of the volume - of their intersection divided by the volume of their union. In the context + of their intersection over the volume of their union. In the context of graphs, the neighborhood of a vertex is seen as a set. The Jaccard similarity weight of each edge represents the strength of connection between vertices based on the relative similarity of their neighbors. @@ -238,3 +239,119 @@ def jaccard_coefficient( ) return df + + +def all_pairs_jaccard( + input_graph: Graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None, +): + """ + Compute the All Pairs Jaccard similarity between all pairs of vertices specified. 
+ All pairs Jaccard similarity is defined between two sets as the ratio of the volume + of their intersection over the volume of their union. In the context + of graphs, the neighborhood of a vertex is seen as a set. The Jaccard + similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. + + cugraph.all_pairs_jaccard, in the absence of specified vertices, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the jaccard coefficient for all the vertex pairs in the graph. + This is not advisable as the vertex_pairs can grow exponentially with respect to + the size of the datasets. + + If the topk parameter is specified then the result will only contain the top k + highest scoring results. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list. The graph should be undirected where an undirected + edge is represented by a directed edge in both direction.The adjacency + list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertices : int or list or cudf.Series or cudf.DataFrame, optional (default=None) + A GPU Series containing the input vertex list. If the vertex list is not + provided then the current implementation computes the jaccard coefficient for + all adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted jaccard (if use_weight==True) + or un-weighted jaccard (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + topk : int, optional (default=None) + Specify the number of answers to return otherwise returns the entire + solution + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the Jaccard weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. + + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + df['jaccard_coeff'] : cudf.Series + The computed Jaccard coefficient between the first and the second + vertex ID. 
+ + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import all_pairs_jaccard + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = all_pairs_jaccard(input_graph) + + """ + if input_graph.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertices is not None: + + if isinstance(vertices, int): + vertices = [vertices] + + if isinstance(vertices, list): + vertices = cudf.Series( + vertices, + dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype, + ) + + if input_graph.renumbered is True: + if isinstance(vertices, cudf.DataFrame): + vertices = input_graph.lookup_internal_vertex_id( + vertices, vertices.columns + ) + else: + vertices = input_graph.lookup_internal_vertex_id(vertices) + + first, second, jaccard_coeff = pylibcugraph_all_pairs_jaccard_coefficients( + resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + vertices=vertices, + use_weight=use_weight, + topk=topk, + do_expensive_check=False, + ) + vertex_pair = cudf.DataFrame() + vertex_pair["first"] = first + vertex_pair["second"] = second + + if input_graph.renumbered: + vertex_pair = input_graph.unrenumber(vertex_pair, "first", preserve_order=True) + vertex_pair = input_graph.unrenumber(vertex_pair, "second", preserve_order=True) + + df = vertex_pair + df["jaccard_coeff"] = cudf.Series(jaccard_coeff) + + return df diff --git a/python/cugraph/cugraph/link_prediction/overlap.py b/python/cugraph/cugraph/link_prediction/overlap.py index b6e9cfb58c4..52697d6b552 100644 --- a/python/cugraph/cugraph/link_prediction/overlap.py +++ b/python/cugraph/cugraph/link_prediction/overlap.py @@ -22,6 +22,7 @@ from pylibcugraph import ( overlap_coefficients as pylibcugraph_overlap_coefficients, + all_pairs_overlap_coefficients as pylibcugraph_all_pairs_overlap_coefficients, ) from pylibcugraph import ResourceHandle @@ -151,7 +152,7 @@ def overlap( Compute the Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. Overlap Coefficient is defined between two sets as the ratio of the volume - of their intersection divided by the smaller of their two volumes. In the + of their intersection over the smaller of their two volumes. In the context of graphs, the neighborhood of a vertex is seen as a set. The Overlap Coefficient weight of each edge represents the strength of connection between vertices based on the relative similarity of their @@ -271,3 +272,121 @@ def overlap( df["overlap_coeff"] = cudf.Series(overlap_coeff) return df + + +def all_pairs_overlap( + input_graph: Graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None, +): + """ + Compute the All Pairs Overlap Coefficient between each pair of vertices connected + by an edge, or between arbitrary pairs of vertices specified by the user. + Overlap Coefficient is defined between two sets as the ratio of the volume + of their intersection over the smaller of their two volumes. In the + context of graphs, the neighborhood of a vertex is seen as a set. The + Overlap Coefficient weight of each edge represents the strength of + connection between vertices based on the relative similarity of their + neighbors. + + cugraph.all_pairs_overlap, in the absence of specified vertices, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the overlap coefficient for all the vertex pairs in the graph. 
+ This is not advisable as the vertex_pairs can grow exponentially with respect to + the size of the datasets. + + If the topk parameter is specified then the result will only contain the top k + highest scoring results. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list. The graph should be undirected where an undirected + edge is represented by a directed edge in both direction.The adjacency + list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertices : int or list or cudf.Series or cudf.DataFrame, optional (default=None) + A GPU Series containing the input vertex list. If the vertex list is not + provided then the current implementation computes the overlap coefficient for + all adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted overlap (if use_weight==True) + or un-weighted overlap (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + topk : int, optional (default=None) + Specify the number of answers to return otherwise returns the entire + solution + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the Overlap weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. + + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + df['overlap_coeff'] : cudf.Series + The computed Overlap coefficient between the first and the second + vertex ID. 
+ + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import all_pairs_overlap + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = all_pairs_overlap(input_graph) + + """ + if input_graph.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertices is not None: + + if isinstance(vertices, int): + vertices = [vertices] + + if isinstance(vertices, list): + vertices = cudf.Series( + vertices, + dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype, + ) + + if input_graph.renumbered is True: + if isinstance(vertices, cudf.DataFrame): + vertices = input_graph.lookup_internal_vertex_id( + vertices, vertices.columns + ) + else: + vertices = input_graph.lookup_internal_vertex_id(vertices) + + first, second, overlap_coeff = pylibcugraph_all_pairs_overlap_coefficients( + resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + vertices=vertices, + use_weight=use_weight, + topk=topk, + do_expensive_check=False, + ) + vertex_pair = cudf.DataFrame() + vertex_pair["first"] = first + vertex_pair["second"] = second + + if input_graph.renumbered: + vertex_pair = input_graph.unrenumber(vertex_pair, "first", preserve_order=True) + vertex_pair = input_graph.unrenumber(vertex_pair, "second", preserve_order=True) + + df = vertex_pair + df["overlap_coeff"] = cudf.Series(overlap_coeff) + + return df diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py index cac8bfb9cc6..8030234993b 100644 --- a/python/cugraph/cugraph/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/link_prediction/sorensen.py @@ -22,6 +22,7 @@ from pylibcugraph import ( sorensen_coefficients as pylibcugraph_sorensen_coefficients, + all_pairs_sorensen_coefficients as pylibcugraph_all_pairs_sorensen_coefficients, ) from pylibcugraph import ResourceHandle @@ -66,7 +67,7 @@ def sorensen( Compute the Sorensen coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. Sorensen coefficient is defined between two sets as the ratio of twice the - volume of their intersection divided by the volume of each set. + volume of their intersection over the volume of each set. If first is specified but second is not, or vice versa, an exception will be thrown. @@ -209,8 +210,8 @@ def sorensen_coefficient( vertices or iterable of 2-tuples (u, v) where u and v are nodes in the graph. - If provided, the Overlap coefficient is computed for the given vertex - pairs. Otherwise, the current implementation computes the overlap + If provided, the Sorensen coefficient is computed for the given vertex + pairs. Otherwise, the current implementation computes the sorensen coefficient for all adjacent vertices in the graph. do_expensive_check : bool, optional (default=False) @@ -270,3 +271,119 @@ def sorensen_coefficient( ) return df + + +def all_pairs_sorensen( + input_graph: Graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None, +): + """ + Compute the All Pairs Sorensen coefficient between each pair of vertices connected + by an edge, or between arbitrary pairs of vertices specified by the user. + Sorensen coefficient is defined between two sets as the ratio of twice the + volume of their intersection over the volume of each set. + If first is specified but second is not, or vice versa, an exception will + be thrown. 
+ + cugraph.all_pairs_sorensen, in the absence of specified vertices, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the sorensen coefficient for all the vertex pairs in the graph. + This is not advisable as the vertex_pairs can grow exponentially with respect to + the size of the datasets. + + If the topk parameter is specified then the result will only contain the top k + highest scoring results. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list. The graph should be undirected where an undirected + edge is represented by a directed edge in both direction.The adjacency + list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertices : int or list or cudf.Series or cudf.DataFrame, optional (default=None) + A GPU Series containing the input vertex list. If the vertex list is not + provided then the current implementation computes the sorensen coefficient for + all adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted sorensen (if use_weight==True) + or un-weighted sorensen (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + topk : int, optional (default=None) + Specify the number of answers to return otherwise returns the entire + solution + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the Sorensen weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. + + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + df['sorensen_coeff'] : cudf.Series + The computed Sorensen coefficient between the first and the second + vertex ID. 
+ + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import all_pairs_sorensen + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = all_pairs_sorensen(input_graph) + + """ + if input_graph.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertices is not None: + + if isinstance(vertices, int): + vertices = [vertices] + + if isinstance(vertices, list): + vertices = cudf.Series( + vertices, + dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype, + ) + + if input_graph.renumbered is True: + if isinstance(vertices, cudf.DataFrame): + vertices = input_graph.lookup_internal_vertex_id( + vertices, vertices.columns + ) + else: + vertices = input_graph.lookup_internal_vertex_id(vertices) + + first, second, sorensen_coeff = pylibcugraph_all_pairs_sorensen_coefficients( + resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + vertices=vertices, + use_weight=use_weight, + topk=topk, + do_expensive_check=False, + ) + vertex_pair = cudf.DataFrame() + vertex_pair["first"] = first + vertex_pair["second"] = second + + if input_graph.renumbered: + vertex_pair = input_graph.unrenumber(vertex_pair, "first", preserve_order=True) + vertex_pair = input_graph.unrenumber(vertex_pair, "second", preserve_order=True) + + df = vertex_pair + df["sorensen_coeff"] = cudf.Series(sorensen_coeff) + + return df diff --git a/python/cugraph/cugraph/tests/link_prediction/test_cosine_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_cosine_mg.py new file mode 100644 index 00000000000..f85508cb089 --- /dev/null +++ b/python/cugraph/cugraph/tests/link_prediction/test_cosine_mg.py @@ -0,0 +1,292 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
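The new MG test below compares distributed cosine results against the single-GPU implementation. For the unweighted case, the cosine similarity of two neighborhoods reduces to |N(u) ∩ N(v)| / sqrt(|N(u)| · |N(v)|); a small pure-Python illustration of that definition (a sketch only, not the GPU code path being tested):

    import math

    def cosine_coefficient(neighbors_u: set, neighbors_v: set) -> float:
        """Unweighted cosine: |N(u) & N(v)| / sqrt(|N(u)| * |N(v)|)."""
        if not neighbors_u or not neighbors_v:
            return 0.0
        return len(neighbors_u & neighbors_v) / math.sqrt(
            len(neighbors_u) * len(neighbors_v)
        )

    # Same neighbor sets as the earlier examples: 2 / sqrt(3 * 4) ~= 0.577
    print(cosine_coefficient({1, 2, 3}, {2, 3, 4, 5}))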
+ +import gc +import random + +import pytest + +import dask_cudf +import cugraph +import cugraph.dask as dcg +from cugraph.testing import utils +from pylibcugraph.testing import gen_fixture_params_product + + +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= + + +def setup_function(): + gc.collect() + + +IS_DIRECTED = [False] +HAS_VERTEX_PAIR = [False, True] +HAS_VERTICES = [False, True] +HAS_TOPK = [False, True] +IS_WEIGHTED = [False, True] + + +# ============================================================================= +# Pytest fixtures +# ============================================================================= + +datasets = utils.DATASETS_UNDIRECTED + [ + utils.RAPIDS_DATASET_ROOT_DIR_PATH / "email-Eu-core.csv" +] + +fixture_params = gen_fixture_params_product( + (datasets, "graph_file"), + (IS_DIRECTED, "directed"), + (HAS_VERTEX_PAIR, "has_vertex_pair"), + (HAS_VERTICES, "has_vertices"), + (HAS_TOPK, "has_topk"), + (IS_WEIGHTED, "is_weighted"), +) + + +@pytest.fixture(scope="module", params=fixture_params) +def input_combo(request): + """ + Simply return the current combination of params as a dictionary for use in + tests or other parameterized fixtures. + """ + parameters = dict( + zip( + ( + "graph_file", + "directed", + "has_vertex_pair", + "has_vertices", + "has_topk", + "is_weighted", + ), + request.param, + ) + ) + + return parameters + + +@pytest.fixture(scope="module") +def input_expected_output(input_combo): + """ + This fixture returns the inputs and expected results from the Cosine algo. + (based on cuGraph Cosine) which can be used for validation. + """ + + input_data_path = input_combo["graph_file"] + directed = input_combo["directed"] + has_vertex_pair = input_combo["has_vertex_pair"] + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) + if has_vertex_pair: + # Sample random vertices from the graph and compute the two_hop_neighbors + # with those seeds + k = random.randint(1, 10) + seeds = random.sample(range(G.number_of_vertices()), k) + + vertex_pair = G.get_two_hop_neighbors(start_vertices=seeds) + else: + vertex_pair = None + + input_combo["vertex_pair"] = vertex_pair + sg_cugraph_cosine = cugraph.cosine( + G, input_combo["vertex_pair"], use_weight=is_weighted + ) + # Save the results back to the input_combo dictionary to prevent redundant + # cuGraph runs. Other tests using the input_combo fixture will look for + # them, and if not present they will have to re-run the same cuGraph call. + + input_combo["sg_cugraph_results"] = sg_cugraph_cosine + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + blocksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + edge_attr="value" if is_weighted else None, + renumber=True, + store_transposed=True, + ) + + input_combo["MGGraph"] = dg + + return input_combo + + +@pytest.fixture(scope="module") +def input_expected_output_all_pairs(input_combo): + """ + This fixture returns the inputs and expected results from the Cosine algo. + (based on cuGraph Cosine) which can be used for validation. 
+ """ + + input_data_path = input_combo["graph_file"] + directed = input_combo["directed"] + has_vertices = input_combo["has_vertices"] + has_topk = input_combo["has_topk"] + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) + if has_vertices: + # Sample random vertices from the graph and compute the two_hop_neighbors + # with those seeds + k = random.randint(1, 10) + vertices = random.sample(range(G.number_of_vertices()), k) + + else: + vertices = None + + if has_topk: + topk = 5 + else: + topk = None + + input_combo["vertices"] = vertices + print("vertices ", vertices, " is_weighted = ", is_weighted) + input_combo["topk"] = topk + sg_cugraph_all_pairs_cosine = cugraph.all_pairs_cosine( + G, + vertices=input_combo["vertices"], + topk=input_combo["topk"], + use_weight=is_weighted, + ) + # Save the results back to the input_combo dictionary to prevent redundant + # cuGraph runs. Other tests using the input_combo fixture will look for + # them, and if not present they will have to re-run the same cuGraph call. + + input_combo["sg_cugraph_results"] = sg_cugraph_all_pairs_cosine + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + blocksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + edge_attr="value" if is_weighted else None, + renumber=True, + store_transposed=True, + ) + + input_combo["MGGraph"] = dg + + return input_combo + + +# ============================================================================= +# Tests +# ============================================================================= + + +@pytest.mark.mg +def test_dask_mg_cosine(dask_client, benchmark, input_expected_output): + + dg = input_expected_output["MGGraph"] + use_weight = input_expected_output["is_weighted"] + + result_cosine = benchmark( + dcg.cosine, dg, input_expected_output["vertex_pair"], use_weight=use_weight + ) + + result_cosine = ( + result_cosine.compute() + .sort_values(["first", "second"]) + .reset_index(drop=True) + .rename(columns={"cosine_coeff": "mg_cugraph_cosine_coeff"}) + ) + + expected_output = ( + input_expected_output["sg_cugraph_results"] + .sort_values(["first", "second"]) + .reset_index(drop=True) + ) + + # Update the dask cugraph Cosine results with sg cugraph results for easy + # comparison using cuDF DataFrame methods. 
+ result_cosine["sg_cugraph_cosine_coeff"] = expected_output["cosine_coeff"] + + cosine_coeff_diffs1 = result_cosine.query( + "mg_cugraph_cosine_coeff - sg_cugraph_cosine_coeff > 0.00001" + ) + cosine_coeff_diffs2 = result_cosine.query( + "mg_cugraph_cosine_coeff - sg_cugraph_cosine_coeff < -0.00001" + ) + + assert len(cosine_coeff_diffs1) == 0 + assert len(cosine_coeff_diffs2) == 0 + + +@pytest.mark.mg +def test_dask_mg_all_pairs_cosine( + dask_client, benchmark, input_expected_output_all_pairs +): + + dg = input_expected_output_all_pairs["MGGraph"] + + use_weight = input_expected_output_all_pairs["is_weighted"] + + result_cosine = benchmark( + dcg.all_pairs_cosine, + dg, + vertices=input_expected_output_all_pairs["vertices"], + topk=input_expected_output_all_pairs["topk"], + use_weight=use_weight, + ) + + result_cosine = ( + result_cosine.compute() + .sort_values(["first", "second"]) + .reset_index(drop=True) + .rename(columns={"cosine_coeff": "mg_cugraph_cosine_coeff"}) + ) + + expected_output = ( + input_expected_output_all_pairs["sg_cugraph_results"] + .sort_values(["first", "second"]) + .reset_index(drop=True) + ) + + # Update the dask cugraph Cosine results with sg cugraph results for easy + # comparison using cuDF DataFrame methods. + result_cosine["sg_cugraph_cosine_coeff"] = expected_output["cosine_coeff"] + + cosine_coeff_diffs1 = result_cosine.query( + "mg_cugraph_cosine_coeff - sg_cugraph_cosine_coeff > 0.00001" + ) + cosine_coeff_diffs2 = result_cosine.query( + "mg_cugraph_cosine_coeff - sg_cugraph_cosine_coeff < -0.00001" + ) + + assert len(cosine_coeff_diffs1) == 0 + assert len(cosine_coeff_diffs2) == 0 diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index 3691ad5a8c9..34ee72e799b 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -22,7 +22,7 @@ import cugraph from cugraph.datasets import netscience from cugraph.testing import utils, UNDIRECTED_DATASETS -from cudf.testing import assert_series_equal +from cudf.testing import assert_series_equal, assert_frame_equal SRC_COL = "0" DST_COL = "1" @@ -341,3 +341,90 @@ def test_weighted_jaccard(): G = karate.get_graph(ignore_weights=True) with pytest.raises(ValueError): cugraph.jaccard(G, use_weight=True) + + +@pytest.mark.sg +def test_all_pairs_jaccard(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Jaccard + jaccard_results = cugraph.jaccard(G) + + # Remove self loop + jaccard_results = jaccard_results[ + jaccard_results["first"] != jaccard_results["second"] + ].reset_index(drop=True) + + all_pairs_jaccard_results = cugraph.all_pairs_jaccard(G) + + assert_frame_equal( + jaccard_results.head(), + all_pairs_jaccard_results.head(), + check_dtype=False, + check_like=True, + ) + + +# FIXME +@pytest.mark.sg +@pytest.mark.skip(reason="Inaccurate results returned by all-pairs similarity") +def test_all_pairs_jaccard_with_vertices(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Jaccard + jaccard_results = cugraph.jaccard(G) + + # Remove self loop + jaccard_results = jaccard_results[ + jaccard_results["first"] != jaccard_results["second"] + ].reset_index(drop=True) + + vertices = [0, 1, 2] + + mask_first = jaccard_results["first"].isin(vertices) + mask_second = jaccard_results["second"].isin(vertices) + # mask = [v in vertices for v in (jaccard_results['first'].to_pandas() + 
# or jaccard_results['second'].to_pandas())] + mask = [f or s for (f, s) in zip(mask_first.to_pandas(), mask_second.to_pandas())] + + jaccard_results = jaccard_results[mask].reset_index(drop=True) + + # Call all-pairs Jaccard + all_pairs_jaccard_results = cugraph.all_pairs_jaccard( + G, vertices=cudf.Series(vertices, dtype="int32") + ) + + assert_frame_equal( + jaccard_results, all_pairs_jaccard_results, check_dtype=False, check_like=True + ) + + +@pytest.mark.sg +def test_all_pairs_jaccard_with_topk(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Jaccard + jaccard_results = cugraph.jaccard(G) + + topk = 4 + + # Remove self loop + jaccard_results = ( + jaccard_results[jaccard_results["first"] != jaccard_results["second"]] + .sort_values(["jaccard_coeff", "first", "second"], ascending=False) + .reset_index(drop=True)[:topk] + ) + + # Call all-pairs Jaccard + all_pairs_jaccard_results = ( + cugraph.all_pairs_jaccard(G, topk=topk) + .sort_values(["first", "second"], ascending=False) + .reset_index(drop=True) + ) + + assert_frame_equal( + jaccard_results, all_pairs_jaccard_results, check_dtype=False, check_like=True + ) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py index 98f64906564..244718ce927 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py @@ -33,8 +33,10 @@ def setup_function(): IS_DIRECTED = [False] -HAS_VERTEX_PAIR = [True, False] -IS_WEIGHTED = [True, False] +HAS_VERTEX_PAIR = [False, True] +HAS_VERTICES = [False, True] +HAS_TOPK = [False, True] +IS_WEIGHTED = [False, True] # ============================================================================= @@ -49,6 +51,8 @@ def setup_function(): (datasets, "graph_file"), (IS_DIRECTED, "directed"), (HAS_VERTEX_PAIR, "has_vertex_pair"), + (HAS_VERTICES, "has_vertices"), + (HAS_TOPK, "has_topk"), (IS_WEIGHTED, "is_weighted"), ) @@ -60,7 +64,17 @@ def input_combo(request): tests or other parameterized fixtures. """ parameters = dict( - zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param) + zip( + ( + "graph_file", + "directed", + "has_vertex_pair", + "has_vertices", + "has_topk", + "is_weighted", + ), + request.param, + ) ) return parameters @@ -123,6 +137,76 @@ def input_expected_output(input_combo): return input_combo +@pytest.fixture(scope="module") +def input_expected_output_all_pairs(input_combo): + """ + This fixture returns the inputs and expected results from the Jaccard algo. + (based on cuGraph Jaccard) which can be used for validation. 
+ """ + + input_data_path = input_combo["graph_file"] + directed = input_combo["directed"] + has_vertices = input_combo["has_vertices"] + has_topk = input_combo["has_topk"] + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) + + if has_topk: + topk = 5 + else: + topk = None + + if has_vertices: + # Sample random vertices from the graph and compute the two_hop_neighbors + # with those seeds + k = random.randint(1, 10) + vertices = random.sample(range(G.number_of_vertices()), k) + + else: + vertices = None + # If no start_vertices are passed, all_pairs similarity runs OOM + topk = 10 + + input_combo["vertices"] = vertices + input_combo["topk"] = topk + print("vertices ", vertices) + sg_cugraph_all_pairs_jaccard = cugraph.all_pairs_jaccard( + G, + vertices=input_combo["vertices"], + topk=input_combo["topk"], + use_weight=is_weighted, + ) + # Save the results back to the input_combo dictionary to prevent redundant + # cuGraph runs. Other tests using the input_combo fixture will look for + # them, and if not present they will have to re-run the same cuGraph call. + + input_combo["sg_cugraph_results"] = sg_cugraph_all_pairs_jaccard + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + blocksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + edge_attr="value" if is_weighted else None, + renumber=True, + store_transposed=True, + ) + + input_combo["MGGraph"] = dg + + return input_combo + + # ============================================================================= # Tests # ============================================================================= @@ -164,3 +248,48 @@ def test_dask_mg_jaccard(dask_client, benchmark, input_expected_output): assert len(jaccard_coeff_diffs1) == 0 assert len(jaccard_coeff_diffs2) == 0 + + +@pytest.mark.mg +def test_dask_mg_all_pairs_jaccard( + dask_client, benchmark, input_expected_output_all_pairs +): + + dg = input_expected_output_all_pairs["MGGraph"] + + use_weight = input_expected_output_all_pairs["is_weighted"] + + result_jaccard = benchmark( + dcg.all_pairs_jaccard, + dg, + vertices=input_expected_output_all_pairs["vertices"], + topk=input_expected_output_all_pairs["topk"], + use_weight=use_weight, + ) + + result_jaccard = ( + result_jaccard.compute() + .sort_values(["first", "second"]) + .reset_index(drop=True) + .rename(columns={"jaccard_coeff": "mg_cugraph_jaccard_coeff"}) + ) + + expected_output = ( + input_expected_output_all_pairs["sg_cugraph_results"] + .sort_values(["first", "second"]) + .reset_index(drop=True) + ) + + # Update the dask cugraph Jaccard results with sg cugraph results for easy + # comparison using cuDF DataFrame methods. 
+ result_jaccard["sg_cugraph_jaccard_coeff"] = expected_output["jaccard_coeff"] + + jaccard_coeff_diffs1 = result_jaccard.query( + "mg_cugraph_jaccard_coeff - sg_cugraph_jaccard_coeff > 0.00001" + ) + jaccard_coeff_diffs2 = result_jaccard.query( + "mg_cugraph_jaccard_coeff - sg_cugraph_jaccard_coeff < -0.00001" + ) + + assert len(jaccard_coeff_diffs1) == 0 + assert len(jaccard_coeff_diffs2) == 0 diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py index 4b00330b6c9..f87fe06f691 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py @@ -20,7 +20,8 @@ import cudf import cugraph from cugraph.testing import utils, UNDIRECTED_DATASETS -from cudf.testing import assert_series_equal +from cudf.testing import assert_series_equal, assert_frame_equal +import pandas as pd SRC_COL = "0" DST_COL = "1" @@ -114,6 +115,50 @@ def cpu_call(M, first, second): return result +def compare(src1, dst1, val1, src2, dst2, val2): + # + # We will do comparison computations by using dataframe + # merge functions (essentially doing fast joins). We + # start by making two data frames + # + df1 = cudf.DataFrame() + df1["src1"] = src1 + df1["dst1"] = dst1 + if val1 is not None: + df1["val1"] = val1 + + df2 = cudf.DataFrame() + df2["src2"] = src2 + df2["dst2"] = dst2 + if val2 is not None: + df2["val2"] = val2 + + # + # Check to see if all pairs in the original data frame + # still exist in the new data frame. If we join (merge) + # the data frames where (src1[i]=src2[i]) and (dst1[i]=dst2[i]) + # then we should get exactly the same number of entries in + # the data frame if we did not lose any data. + # + join = df1.merge(df2, left_on=["src1", "dst1"], right_on=["src2", "dst2"]) + + if len(df1) != len(join): + join2 = df1.merge( + df2, how="left", left_on=["src1", "dst1"], right_on=["src2", "dst2"] + ) + pd.set_option("display.max_rows", 500) + print("df1 = \n", df1.sort_values(["src1", "dst1"])) + print("df2 = \n", df2.sort_values(["src2", "dst2"])) + print( + "join2 = \n", + join2.sort_values(["src1", "dst1"]) + .to_pandas() + .query("src2.isnull()", engine="python"), + ) + + assert len(df1) == len(join) + + # ============================================================================= # Pytest Fixtures # ============================================================================= @@ -242,3 +287,106 @@ def test_weighted_overlap(): G = karate.get_graph(ignore_weights=True) with pytest.raises(ValueError): cugraph.overlap(G, use_weight=True) + + +@pytest.mark.sg +def test_all_pairs_overlap(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Overlap + overlap_results = cugraph.overlap(G) + + # Remove self loop + overlap_results = overlap_results[ + overlap_results["first"] != overlap_results["second"] + ].reset_index(drop=True) + + all_pairs_overlap_results = cugraph.all_pairs_overlap(G) + + assert_frame_equal( + overlap_results.head(), + all_pairs_overlap_results.head(), + check_dtype=False, + check_like=True, + ) + + +# FIXME +@pytest.mark.sg +@pytest.mark.skip(reason="Inaccurate results returned by all-pairs similarity") +def test_all_pairs_overlap_with_vertices(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Overlap + overlap_results = cugraph.overlap(G) + + # Remove self loop + overlap_results = overlap_results[ + overlap_results["first"] != overlap_results["second"] + 
].reset_index(drop=True) + + vertices = [0, 1, 2] + + mask_first = overlap_results["first"].isin(vertices) + mask_second = overlap_results["second"].isin(vertices) + # mask = [v in vertices for v in (overlap_results['first'].to_pandas() + # or overlap_results['second'].to_pandas())] + mask = [f or s for (f, s) in zip(mask_first.to_pandas(), mask_second.to_pandas())] + + overlap_results = overlap_results[mask].reset_index(drop=True) + + # Call all-pairs Overlap + all_pairs_overlap_results = cugraph.all_pairs_overlap( + G, vertices=cudf.Series(vertices, dtype="int32") + ) + + assert_frame_equal( + overlap_results, all_pairs_overlap_results, check_dtype=False, check_like=True + ) + + +@pytest.mark.sg +def test_all_pairs_overlap_with_topk(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Overlap + overlap_results = cugraph.overlap(G) + + topk = 10 + + # Remove self loop + overlap_results = ( + overlap_results[overlap_results["first"] != overlap_results["second"]] + .sort_values(["overlap_coeff", "first", "second"], ascending=False) + .reset_index(drop=True) # [:topk] + ) + print("overlap_results = \n", overlap_results) + + # Call all-pairs overlap + all_pairs_overlap_results = ( + cugraph.all_pairs_overlap(G, topk=topk) + .sort_values(["first", "second"], ascending=False) + .reset_index(drop=True) + ) + + # 1. All pair similarity might return different top pairs k pairs + # which are still valid hence, ensure the pairs returned by all-pairs + # exists. + + compare( + all_pairs_overlap_results["first"], + all_pairs_overlap_results["second"], + all_pairs_overlap_results["overlap_coeff"], + overlap_results["first"], + overlap_results["second"], + overlap_results["overlap_coeff"], + ) + + # 2. Ensure the coefficient scores are still the highest + assert_series_equal( + all_pairs_overlap_results["overlap_coeff"], + overlap_results["overlap_coeff"][:topk], + ) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py index 9afe7dd842f..aa238f6a6de 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py @@ -33,8 +33,10 @@ def setup_function(): IS_DIRECTED = [False] -HAS_VERTEX_PAIR = [True, False] -IS_WEIGHTED = [True, False] +HAS_VERTEX_PAIR = [False, True] +HAS_VERTICES = [False, True] +HAS_TOPK = [False, True] +IS_WEIGHTED = [False, True] # ============================================================================= @@ -49,6 +51,8 @@ def setup_function(): (datasets, "graph_file"), (IS_DIRECTED, "directed"), (HAS_VERTEX_PAIR, "has_vertex_pair"), + (HAS_VERTICES, "has_vertices"), + (HAS_TOPK, "has_topk"), (IS_WEIGHTED, "is_weighted"), ) @@ -60,7 +64,17 @@ def input_combo(request): tests or other parameterized fixtures. """ parameters = dict( - zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param) + zip( + ( + "graph_file", + "directed", + "has_vertex_pair", + "has_vertices", + "has_topk", + "is_weighted", + ), + request.param, + ) ) return parameters @@ -123,6 +137,76 @@ def input_expected_output(input_combo): return input_combo +@pytest.fixture(scope="module") +def input_expected_output_all_pairs(input_combo): + """ + This fixture returns the inputs and expected results from the Overlap algo. + (based on cuGraph Overlap) which can be used for validation. 
+ """ + + input_data_path = input_combo["graph_file"] + directed = input_combo["directed"] + has_vertices = input_combo["has_vertices"] + has_topk = input_combo["has_topk"] + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) + + if has_topk: + topk = 5 + else: + topk = None + + if has_vertices: + # Sample random vertices from the graph and compute the two_hop_neighbors + # with those seeds + k = random.randint(1, 10) + vertices = random.sample(range(G.number_of_vertices()), k) + + else: + vertices = None + # If no start_vertices are passed, all_pairs similarity runs OOM + topk = 10 + + input_combo["vertices"] = vertices + input_combo["topk"] = topk + print("vertices ", vertices) + sg_cugraph_all_pairs_overlap = cugraph.all_pairs_overlap( + G, + vertices=input_combo["vertices"], + topk=input_combo["topk"], + use_weight=is_weighted, + ) + # Save the results back to the input_combo dictionary to prevent redundant + # cuGraph runs. Other tests using the input_combo fixture will look for + # them, and if not present they will have to re-run the same cuGraph call. + + input_combo["sg_cugraph_results"] = sg_cugraph_all_pairs_overlap + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + blocksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + edge_attr="value" if is_weighted else None, + renumber=True, + store_transposed=True, + ) + + input_combo["MGGraph"] = dg + + return input_combo + + # ============================================================================= # Tests # ============================================================================= @@ -167,3 +251,48 @@ def test_dask_mg_overlap(dask_client, benchmark, input_expected_output): assert len(overlap_coeff_diffs1) == 0 assert len(overlap_coeff_diffs2) == 0 + + +@pytest.mark.mg +def test_dask_mg_all_pairs_overlap( + dask_client, benchmark, input_expected_output_all_pairs +): + + dg = input_expected_output_all_pairs["MGGraph"] + + use_weight = input_expected_output_all_pairs["is_weighted"] + + result_overlap = benchmark( + dcg.all_pairs_overlap, + dg, + vertices=input_expected_output_all_pairs["vertices"], + topk=input_expected_output_all_pairs["topk"], + use_weight=use_weight, + ) + + result_overlap = ( + result_overlap.compute() + .sort_values(["first", "second"]) + .reset_index(drop=True) + .rename(columns={"overlap_coeff": "mg_cugraph_overlap_coeff"}) + ) + + expected_output = ( + input_expected_output_all_pairs["sg_cugraph_results"] + .sort_values(["first", "second"]) + .reset_index(drop=True) + ) + + # Update the dask cugraph Overlap results with sg cugraph results for easy + # comparison using cuDF DataFrame methods. 
+ result_overlap["sg_cugraph_overlap_coeff"] = expected_output["overlap_coeff"] + + overlap_coeff_diffs1 = result_overlap.query( + "mg_cugraph_overlap_coeff - sg_cugraph_overlap_coeff > 0.00001" + ) + overlap_coeff_diffs2 = result_overlap.query( + "mg_cugraph_overlap_coeff - sg_cugraph_overlap_coeff < -0.00001" + ) + + assert len(overlap_coeff_diffs1) == 0 + assert len(overlap_coeff_diffs2) == 0 diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py index 6345187a376..4c30f149ea5 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py @@ -15,12 +15,13 @@ import pytest import networkx as nx +import pandas as pd import cudf import cugraph from cugraph.testing import utils, UNDIRECTED_DATASETS from cugraph.datasets import netscience -from cudf.testing import assert_series_equal +from cudf.testing import assert_series_equal, assert_frame_equal SRC_COL = "0" DST_COL = "1" @@ -156,6 +157,50 @@ def networkx_call(M, benchmark_callable=None): return src, dst, coeff +def compare(src1, dst1, val1, src2, dst2, val2): + # + # We will do comparison computations by using dataframe + # merge functions (essentially doing fast joins). We + # start by making two data frames + # + df1 = cudf.DataFrame() + df1["src1"] = src1 + df1["dst1"] = dst1 + if val1 is not None: + df1["val1"] = val1 + + df2 = cudf.DataFrame() + df2["src2"] = src2 + df2["dst2"] = dst2 + if val2 is not None: + df2["val2"] = val2 + + # + # Check to see if all pairs in the original data frame + # still exist in the new data frame. If we join (merge) + # the data frames where (src1[i]=src2[i]) and (dst1[i]=dst2[i]) + # then we should get exactly the same number of entries in + # the data frame if we did not lose any data. 
+ # + join = df1.merge(df2, left_on=["src1", "dst1"], right_on=["src2", "dst2"]) + + if len(df1) != len(join): + join2 = df1.merge( + df2, how="left", left_on=["src1", "dst1"], right_on=["src2", "dst2"] + ) + pd.set_option("display.max_rows", 500) + print("df1 = \n", df1.sort_values(["src1", "dst1"])) + print("df2 = \n", df2.sort_values(["src2", "dst2"])) + print( + "join2 = \n", + join2.sort_values(["src1", "dst1"]) + .to_pandas() + .query("src2.isnull()", engine="python"), + ) + + assert len(df1) == len(join) + + # ============================================================================= # Pytest Fixtures # ============================================================================= @@ -337,3 +382,105 @@ def test_weighted_sorensen(): G = karate.get_graph(ignore_weights=True) with pytest.raises(ValueError): cugraph.sorensen(G, use_weight=True) + + +@pytest.mark.sg +def test_all_pairs_sorensen(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Sorensen + sorensen_results = cugraph.sorensen(G) + + # Remove self loop + sorensen_results = sorensen_results[ + sorensen_results["first"] != sorensen_results["second"] + ].reset_index(drop=True) + + all_pairs_sorensen_results = cugraph.all_pairs_sorensen(G) + + assert_frame_equal( + sorensen_results.head(), + all_pairs_sorensen_results.head(), + check_dtype=False, + check_like=True, + ) + + +# FIXME +@pytest.mark.sg +@pytest.mark.skip(reason="Inaccurate results returned by all-pairs similarity") +def test_all_pairs_sorensen_with_vertices(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Sorensen + sorensen_results = cugraph.sorensen(G) + + # Remove self loop + sorensen_results = sorensen_results[ + sorensen_results["first"] != sorensen_results["second"] + ].reset_index(drop=True) + + vertices = [0, 1, 2] + + mask_first = sorensen_results["first"].isin(vertices) + mask_second = sorensen_results["second"].isin(vertices) + # mask = [v in vertices for v in (sorensen_results['first'].to_pandas() + # or sorensen_results['second'].to_pandas())] + mask = [f or s for (f, s) in zip(mask_first.to_pandas(), mask_second.to_pandas())] + + sorensen_results = sorensen_results[mask].reset_index(drop=True) + + # Call all-pairs Sorensen + all_pairs_sorensen_results = cugraph.all_pairs_sorensen( + G, vertices=cudf.Series(vertices, dtype="int32") + ) + + assert_frame_equal( + sorensen_results, all_pairs_sorensen_results, check_dtype=False, check_like=True + ) + + +@pytest.mark.sg +def test_all_pairs_sorensen_with_topk(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Sorensen + sorensen_results = cugraph.sorensen(G) + + topk = 4 + + # Remove self loop + sorensen_results = ( + sorensen_results[sorensen_results["first"] != sorensen_results["second"]] + .sort_values(["sorensen_coeff", "first", "second"], ascending=False) + .reset_index(drop=True)[:topk] + ) + + # Call all-pairs sorensen + all_pairs_sorensen_results = ( + cugraph.all_pairs_sorensen(G, topk=topk) + .sort_values(["first", "second"], ascending=False) + .reset_index(drop=True) + ) + + # 1. All pair similarity might return different top pairs k pairs + # which are still valid hence, ensure the pairs returned by all-pairs + # exists. + + compare( + all_pairs_sorensen_results["first"], + all_pairs_sorensen_results["second"], + all_pairs_sorensen_results["sorensen_coeff"], + sorensen_results["first"], + sorensen_results["second"], + sorensen_results["sorensen_coeff"], + ) + + # 2. 
Ensure the coefficient scores are still the highest + assert_series_equal( + all_pairs_sorensen_results["sorensen_coeff"], + sorensen_results["sorensen_coeff"][:topk], + ) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py index 6c24fa5af13..e41daa64fb8 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py @@ -34,8 +34,10 @@ def setup_function(): IS_DIRECTED = [False] -HAS_VERTEX_PAIR = [True, False] -IS_WEIGHTED = [True, False] +HAS_VERTEX_PAIR = [False, True] +HAS_VERTICES = [False, True] +HAS_TOPK = [False, True] +IS_WEIGHTED = [False, True] # ============================================================================= @@ -50,6 +52,8 @@ def setup_function(): (datasets, "graph_file"), (IS_DIRECTED, "directed"), (HAS_VERTEX_PAIR, "has_vertex_pair"), + (HAS_VERTICES, "has_vertices"), + (HAS_TOPK, "has_topk"), (IS_WEIGHTED, "is_weighted"), ) @@ -61,7 +65,17 @@ def input_combo(request): tests or other parameterized fixtures. """ parameters = dict( - zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param) + zip( + ( + "graph_file", + "directed", + "has_vertex_pair", + "has_vertices", + "has_topk", + "is_weighted", + ), + request.param, + ) ) return parameters @@ -124,6 +138,76 @@ def input_expected_output(input_combo): return input_combo +@pytest.fixture(scope="module") +def input_expected_output_all_pairs(input_combo): + """ + This fixture returns the inputs and expected results from the Sorensen algo. + (based on cuGraph Sorensen) which can be used for validation. + """ + + input_data_path = input_combo["graph_file"] + directed = input_combo["directed"] + has_vertices = input_combo["has_vertices"] + has_topk = input_combo["has_topk"] + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) + + if has_topk: + topk = 5 + else: + topk = None + + if has_vertices: + # Sample random vertices from the graph and compute the two_hop_neighbors + # with those seeds + k = random.randint(1, 10) + vertices = random.sample(range(G.number_of_vertices()), k) + + else: + vertices = None + # If no start_vertices are passed, all_pairs similarity runs OOM + topk = 10 + + input_combo["vertices"] = vertices + print("vertices ", vertices, " is_weighted = ", is_weighted) + input_combo["topk"] = topk + sg_cugraph_all_pairs_sorensen = cugraph.all_pairs_sorensen( + G, + vertices=input_combo["vertices"], + topk=input_combo["topk"], + use_weight=is_weighted, + ) + # Save the results back to the input_combo dictionary to prevent redundant + # cuGraph runs. Other tests using the input_combo fixture will look for + # them, and if not present they will have to re-run the same cuGraph call. 
+ + input_combo["sg_cugraph_results"] = sg_cugraph_all_pairs_sorensen + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + blocksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + edge_attr="value" if is_weighted else None, + renumber=True, + store_transposed=True, + ) + + input_combo["MGGraph"] = dg + + return input_combo + + # ============================================================================= # Tests # ============================================================================= @@ -166,3 +250,48 @@ def test_dask_mg_sorensen(dask_client, benchmark, input_expected_output): assert len(sorensen_coeff_diffs1) == 0 assert len(sorensen_coeff_diffs2) == 0 + + +@pytest.mark.mg +def test_dask_mg_all_pairs_sorensen( + dask_client, benchmark, input_expected_output_all_pairs +): + + dg = input_expected_output_all_pairs["MGGraph"] + + use_weight = input_expected_output_all_pairs["is_weighted"] + + result_sorensen = benchmark( + dcg.all_pairs_sorensen, + dg, + vertices=input_expected_output_all_pairs["vertices"], + topk=input_expected_output_all_pairs["topk"], + use_weight=use_weight, + ) + + result_sorensen = ( + result_sorensen.compute() + .sort_values(["first", "second"]) + .reset_index(drop=True) + .rename(columns={"sorensen_coeff": "mg_cugraph_sorensen_coeff"}) + ) + + expected_output = ( + input_expected_output_all_pairs["sg_cugraph_results"] + .sort_values(["first", "second"]) + .reset_index(drop=True) + ) + + # Update the dask cugraph sorensen results with sg cugraph results for easy + # comparison using cuDF DataFrame methods. 
+ result_sorensen["sg_cugraph_sorensen_coeff"] = expected_output["sorensen_coeff"] + + sorensen_coeff_diffs1 = result_sorensen.query( + "mg_cugraph_sorensen_coeff - sg_cugraph_sorensen_coeff > 0.00001" + ) + sorensen_coeff_diffs2 = result_sorensen.query( + "mg_cugraph_sorensen_coeff - sg_cugraph_sorensen_coeff < -0.00001" + ) + + assert len(sorensen_coeff_diffs1) == 0 + assert len(sorensen_coeff_diffs2) == 0 diff --git a/python/pylibcugraph/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/CMakeLists.txt index 7cc90145949..90fce23282e 100644 --- a/python/pylibcugraph/pylibcugraph/CMakeLists.txt +++ b/python/pylibcugraph/pylibcugraph/CMakeLists.txt @@ -39,6 +39,7 @@ set(cython_sources jaccard_coefficients.pyx sorensen_coefficients.pyx overlap_coefficients.pyx + cosine_coefficients.pyx katz_centrality.pyx leiden.pyx louvain.pyx @@ -58,6 +59,10 @@ set(cython_sources weakly_connected_components.pyx replicate_edgelist.pyx degrees.pyx + all_pairs_jaccard_coefficients.pyx + all_pairs_sorensen_coefficients.pyx + all_pairs_overlap_coefficients.pyx + all_pairs_cosine_coefficients.pyx ) set(linked_libraries cugraph::cugraph;cugraph::cugraph_c) diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index dcdef05e106..b67acc8bbfc 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -95,6 +95,16 @@ from pylibcugraph.sorensen_coefficients import sorensen_coefficients +from pylibcugraph.cosine_coefficients import cosine_coefficients + +from pylibcugraph.all_pairs_jaccard_coefficients import all_pairs_jaccard_coefficients + +from pylibcugraph.all_pairs_overlap_coefficients import all_pairs_overlap_coefficients + +from pylibcugraph.all_pairs_sorensen_coefficients import all_pairs_sorensen_coefficients + +from pylibcugraph.all_pairs_cosine_coefficients import all_pairs_cosine_coefficients + from pylibcugraph.degrees import in_degrees, out_degrees, degrees diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd index 406094f18d5..71d094a6058 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd @@ -35,11 +35,14 @@ from pylibcugraph._cugraph_c.graph_functions cimport ( cdef extern from "cugraph_c/similarity_algorithms.h": + ########################################################################### - #""" ctypedef struct cugraph_similarity_result_t: pass - #""" + + cdef cugraph_vertex_pairs_t* \ + cugraph_similarity_result_get_vertex_pairs( + cugraph_similarity_result_t* result); cdef cugraph_type_erased_device_array_view_t* \ cugraph_similarity_result_get_similarity( @@ -64,6 +67,20 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_error_t** error ) + ########################################################################### + # all-pairs jaccard coefficients + cdef cugraph_error_code_t \ + cugraph_all_pairs_jaccard_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error + ) + ########################################################################### # sorensen coefficients cdef cugraph_error_code_t \ @@ -77,6 +94,20 @@ cdef extern from 
"cugraph_c/similarity_algorithms.h": cugraph_error_t** error ) + ########################################################################### + # all-pairs sorensen coefficients + cdef cugraph_error_code_t \ + cugraph_all_pairs_sorensen_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error + ) + ########################################################################### # overlap coefficients cdef cugraph_error_code_t \ @@ -89,3 +120,44 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_similarity_result_t** result, cugraph_error_t** error ) + + ########################################################################### + # all-pairs overlap coefficients + cdef cugraph_error_code_t \ + cugraph_all_pairs_overlap_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error + ) + + ########################################################################### + # cosine coefficients + cdef cugraph_error_code_t \ + cugraph_cosine_similarity_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_vertex_pairs_t* vertex_pairs, + bool_t use_weight, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error + ) + + ########################################################################### + # all-pairs cosine coefficients + cdef cugraph_error_code_t \ + cugraph_all_pairs_cosine_similarity_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error + ) diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx new file mode 100644 index 00000000000..b600dd48567 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx @@ -0,0 +1,164 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+from libc.stdint cimport uintptr_t
+from libc.stdio cimport printf
+
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    bool_t,
+    cugraph_resource_handle_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+    cugraph_type_erased_device_array_view_free
+)
+from pylibcugraph._cugraph_c.graph_functions cimport (
+    cugraph_vertex_pairs_t,
+    cugraph_vertex_pairs_get_first,
+    cugraph_vertex_pairs_get_second,
+    cugraph_vertex_pairs_free,
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+from pylibcugraph._cugraph_c.similarity_algorithms cimport (
+    cugraph_all_pairs_cosine_similarity_coefficients,
+    cugraph_similarity_result_t,
+    cugraph_similarity_result_get_similarity,
+    cugraph_similarity_result_get_vertex_pairs,
+    cugraph_similarity_result_free
+)
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+    copy_to_cupy_array,
+    create_cugraph_type_erased_device_array_view_from_py_obj,
+    SIZE_MAX
+)
+
+
+def all_pairs_cosine_coefficients(ResourceHandle resource_handle,
+                                  _GPUGraph graph,
+                                  vertices,
+                                  bool_t use_weight,
+                                  topk,
+                                  bool_t do_expensive_check):
+    """
+    Perform All-Pairs Cosine similarity computation.
+
+    Note that Cosine similarity must run on a symmetric graph.
+
+    Parameters
+    ----------
+    resource_handle : ResourceHandle
+        Handle to the underlying device resources needed for referencing data
+        and running algorithms.
+
+    graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    vertices : cudf.Series or None
+        Vertex list to compute all-pairs. If None, then compute based
+        on all vertices in the graph.
+
+    use_weight : bool, optional
+        If set to True, then compute weighted cosine_coefficients
+        (the input graph must be weighted in that case).
+        Otherwise, compute unweighted cosine_coefficients.
+
+    topk : size_t
+        Specify the number of answers to return; otherwise, all values
+        are returned.
+
+    do_expensive_check : bool
+        If True, performs more extensive tests on the inputs to ensure
+        validity, at the expense of increased run time.
+
+    Returns
+    -------
+    A tuple of device arrays containing the vertex pairs with
+    their corresponding Cosine coefficient scores.
+
+    Examples
+    --------
+    # FIXME: No example yet
+
+    """
+
+    if topk is None:
+        topk = SIZE_MAX
+
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = \
+        resource_handle.c_resource_handle_ptr
+    cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr
+
+    cdef cugraph_similarity_result_t* result_ptr
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        vertices_view_ptr = \
+            create_cugraph_type_erased_device_array_view_from_py_obj(
+                vertices)
+
+    error_code = cugraph_all_pairs_cosine_similarity_coefficients(c_resource_handle_ptr,
+                                                                  c_graph_ptr,
+                                                                  vertices_view_ptr,
+                                                                  use_weight,
+                                                                  topk,
+                                                                  do_expensive_check,
+                                                                  &result_ptr,
+                                                                  &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_all_pairs_cosine_similarity_coefficients")
+
+    # Extract individual device array pointers from result and copy to cupy
+    # arrays for returning.
+ cdef cugraph_type_erased_device_array_view_t* similarity_ptr = \ + cugraph_similarity_result_get_similarity(result_ptr) + + cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr) + + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \ + cugraph_similarity_result_get_vertex_pairs(result_ptr) + + cdef cugraph_type_erased_device_array_view_t* first_view_ptr = \ + cugraph_vertex_pairs_get_first(vertex_pairs_ptr) + + cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_view_ptr) + + cdef cugraph_type_erased_device_array_view_t* second_view_ptr = \ + cugraph_vertex_pairs_get_second(vertex_pairs_ptr) + + cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_view_ptr) + + # Free all pointers + cugraph_similarity_result_free(result_ptr) + cugraph_vertex_pairs_free(vertex_pairs_ptr) + + cugraph_type_erased_device_array_view_free(vertices_view_ptr) + # No need to free 'first_view_ptr' and 'second_view_ptr' as their memory + # are already deallocated when freeing 'result_ptr' + + return cupy_first, cupy_second, cupy_similarity diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx new file mode 100644 index 00000000000..b65905b6850 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx @@ -0,0 +1,164 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t +from libc.stdio cimport printf + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_free +) +from pylibcugraph._cugraph_c.graph_functions cimport ( + cugraph_vertex_pairs_t, + cugraph_vertex_pairs_get_first, + cugraph_vertex_pairs_get_second, + cugraph_vertex_pairs_free, +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.similarity_algorithms cimport ( + cugraph_all_pairs_jaccard_coefficients, + cugraph_similarity_result_t, + cugraph_similarity_result_get_similarity, + cugraph_similarity_result_get_vertex_pairs, + cugraph_similarity_result_free +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + create_cugraph_type_erased_device_array_view_from_py_obj, + SIZE_MAX +) + + +def all_pairs_jaccard_coefficients(ResourceHandle resource_handle, + _GPUGraph graph, + vertices, + bool_t use_weight, + topk, + bool_t do_expensive_check): + """ + Perform All-Pairs Jaccard similarity computation. + + Note that Jaccard similarity must run on a symmetric graph. 
+ + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph or MGGraph + The input graph, for either Single or Multi-GPU operations. + + vertices : cudf.Series or None + Vertex list to compute all-pairs. If None, then compute based + on all vertices in the graph. + + use_weight : bool, optional + If set to True, then compute weighted jaccard_coefficients( + the input graph must be weighted in that case). + Otherwise, compute non-weighted jaccard_coefficients + + topk : size_t + Specify the number of answers to return otherwise will return all values. + + + do_expensive_check : bool + If True, performs more extensive tests on the inputs to ensure + validitity, at the expense of increased run time. + + Returns + ------- + A tuple of device arrays containing the vertex pairs with + their corresponding Jaccard coefficient scores. + + Examples + -------- + # FIXME: No example yet + + """ + + if topk is None: + topk = SIZE_MAX + + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr + + cdef cugraph_similarity_result_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + cdef cugraph_type_erased_device_array_view_t* \ + vertices_view_ptr = \ + create_cugraph_type_erased_device_array_view_from_py_obj( + vertices) + + error_code = cugraph_all_pairs_jaccard_coefficients(c_resource_handle_ptr, + c_graph_ptr, + vertices_view_ptr, + use_weight, + topk, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_all_pairs_jaccard_coefficients") + + # Extract individual device array pointers from result and copy to cupy + # arrays for returning. + cdef cugraph_type_erased_device_array_view_t* similarity_ptr = \ + cugraph_similarity_result_get_similarity(result_ptr) + + cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr) + + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \ + cugraph_similarity_result_get_vertex_pairs(result_ptr) + + cdef cugraph_type_erased_device_array_view_t* first_view_ptr = \ + cugraph_vertex_pairs_get_first(vertex_pairs_ptr) + + cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_view_ptr) + + cdef cugraph_type_erased_device_array_view_t* second_view_ptr = \ + cugraph_vertex_pairs_get_second(vertex_pairs_ptr) + + cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_view_ptr) + + # Free all pointers + cugraph_similarity_result_free(result_ptr) + cugraph_vertex_pairs_free(vertex_pairs_ptr) + + cugraph_type_erased_device_array_view_free(vertices_view_ptr) + # No need to free 'first_view_ptr' and 'second_view_ptr' as their memory + # are already deallocated when freeing 'result_ptr' + + return cupy_first, cupy_second, cupy_similarity diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx new file mode 100644 index 00000000000..74f3bc06a94 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx @@ -0,0 +1,164 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t +from libc.stdio cimport printf + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_free +) +from pylibcugraph._cugraph_c.graph_functions cimport ( + cugraph_vertex_pairs_t, + cugraph_vertex_pairs_get_first, + cugraph_vertex_pairs_get_second, + cugraph_vertex_pairs_free, +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.similarity_algorithms cimport ( + cugraph_all_pairs_overlap_coefficients, + cugraph_similarity_result_t, + cugraph_similarity_result_get_similarity, + cugraph_similarity_result_get_vertex_pairs, + cugraph_similarity_result_free +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + create_cugraph_type_erased_device_array_view_from_py_obj, + SIZE_MAX +) + + +def all_pairs_overlap_coefficients(ResourceHandle resource_handle, + _GPUGraph graph, + vertices, + bool_t use_weight, + topk, + bool_t do_expensive_check): + """ + Perform All-Pairs Overlap similarity computation. + + Note that Overlap similarity must run on a symmetric graph. + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph or MGGraph + The input graph, for either Single or Multi-GPU operations. + + vertices : cudf.Series or None + Vertex list to compute all-pairs. If None, then compute based + on all vertices in the graph. + + use_weight : bool, optional + If set to True, then compute weighted overlap_coefficients( + the input graph must be weighted in that case). + Otherwise, compute non-weighted overlap_coefficients + + topk : size_t + Specify the number of answers to return otherwise will return all values. + + + do_expensive_check : bool + If True, performs more extensive tests on the inputs to ensure + validitity, at the expense of increased run time. + + Returns + ------- + A tuple of device arrays containing the vertex pairs with + their corresponding Overlap coefficient scores. 
+ + Examples + -------- + # FIXME: No example yet + + """ + + if topk is None: + topk = SIZE_MAX + + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr + + cdef cugraph_similarity_result_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + cdef cugraph_type_erased_device_array_view_t* \ + vertices_view_ptr = \ + create_cugraph_type_erased_device_array_view_from_py_obj( + vertices) + + error_code = cugraph_all_pairs_overlap_coefficients(c_resource_handle_ptr, + c_graph_ptr, + vertices_view_ptr, + use_weight, + topk, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_all_pairs_overlap_coefficients") + + # Extract individual device array pointers from result and copy to cupy + # arrays for returning. + cdef cugraph_type_erased_device_array_view_t* similarity_ptr = \ + cugraph_similarity_result_get_similarity(result_ptr) + + cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr) + + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \ + cugraph_similarity_result_get_vertex_pairs(result_ptr) + + cdef cugraph_type_erased_device_array_view_t* first_view_ptr = \ + cugraph_vertex_pairs_get_first(vertex_pairs_ptr) + + cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_view_ptr) + + cdef cugraph_type_erased_device_array_view_t* second_view_ptr = \ + cugraph_vertex_pairs_get_second(vertex_pairs_ptr) + + cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_view_ptr) + + # Free all pointers + cugraph_similarity_result_free(result_ptr) + cugraph_vertex_pairs_free(vertex_pairs_ptr) + + cugraph_type_erased_device_array_view_free(vertices_view_ptr) + # No need to free 'first_view_ptr' and 'second_view_ptr' as their memory + # are already deallocated when freeing 'result_ptr' + + return cupy_first, cupy_second, cupy_similarity diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx new file mode 100644 index 00000000000..5e3fc24a4b4 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx @@ -0,0 +1,164 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
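The topk handling implemented above mirrors what the SG tests earlier in this
patch exercise: the highest-scoring non-self pairs from the pairwise API should
also be returned by the all-pairs variant. A small sketch patterned on
test_all_pairs_overlap_with_topk (exact pairs can differ when coefficients tie,
which is why that test uses a merge-based containment check):

    import cugraph
    from cugraph.datasets import karate

    G = karate.get_graph(download=True, ignore_weights=True)
    topk = 4

    # Top-k pairwise overlap scores over the default two-hop pairs,
    # excluding self-loops.
    pairwise = cugraph.overlap(G)
    pairwise = (
        pairwise[pairwise["first"] != pairwise["second"]]
        .sort_values("overlap_coeff", ascending=False)
        .head(topk)
    )

    # The all-pairs variant computes the same result in a single call.
    all_pairs = cugraph.all_pairs_overlap(G, topk=topk)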
+ +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t +from libc.stdio cimport printf + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_free +) +from pylibcugraph._cugraph_c.graph_functions cimport ( + cugraph_vertex_pairs_t, + cugraph_vertex_pairs_get_first, + cugraph_vertex_pairs_get_second, + cugraph_vertex_pairs_free, +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.similarity_algorithms cimport ( + cugraph_all_pairs_sorensen_coefficients, + cugraph_similarity_result_t, + cugraph_similarity_result_get_similarity, + cugraph_similarity_result_get_vertex_pairs, + cugraph_similarity_result_free +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + create_cugraph_type_erased_device_array_view_from_py_obj, + SIZE_MAX +) + + +def all_pairs_sorensen_coefficients(ResourceHandle resource_handle, + _GPUGraph graph, + vertices, + bool_t use_weight, + topk, + bool_t do_expensive_check): + """ + Perform All-Pairs Sorensen similarity computation. + + Note that Sorensen similarity must run on a symmetric graph. + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph or MGGraph + The input graph, for either Single or Multi-GPU operations. + + vertices : cudf.Series or None + Vertex list to compute all-pairs. If None, then compute based + on all vertices in the graph. + + use_weight : bool, optional + If set to True, then compute weighted sorensen_coefficients( + the input graph must be weighted in that case). + Otherwise, compute non-weighted sorensen_coefficients + + topk : size_t + Specify the number of answers to return otherwise will return all values. + + + do_expensive_check : bool + If True, performs more extensive tests on the inputs to ensure + validitity, at the expense of increased run time. + + Returns + ------- + A tuple of device arrays containing the vertex pairs with + their corresponding Sorensen coefficient scores. + + Examples + -------- + # FIXME: No example yet + + """ + + if topk is None: + topk = SIZE_MAX + + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr + + cdef cugraph_similarity_result_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + cdef cugraph_type_erased_device_array_view_t* \ + vertices_view_ptr = \ + create_cugraph_type_erased_device_array_view_from_py_obj( + vertices) + + error_code = cugraph_all_pairs_sorensen_coefficients(c_resource_handle_ptr, + c_graph_ptr, + vertices_view_ptr, + use_weight, + topk, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_all_pairs_sorensen_coefficients") + + # Extract individual device array pointers from result and copy to cupy + # arrays for returning. 
+ cdef cugraph_type_erased_device_array_view_t* similarity_ptr = \ + cugraph_similarity_result_get_similarity(result_ptr) + + cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr) + + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \ + cugraph_similarity_result_get_vertex_pairs(result_ptr) + + cdef cugraph_type_erased_device_array_view_t* first_view_ptr = \ + cugraph_vertex_pairs_get_first(vertex_pairs_ptr) + + cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_view_ptr) + + cdef cugraph_type_erased_device_array_view_t* second_view_ptr = \ + cugraph_vertex_pairs_get_second(vertex_pairs_ptr) + + cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_view_ptr) + + # Free all pointers + cugraph_similarity_result_free(result_ptr) + cugraph_vertex_pairs_free(vertex_pairs_ptr) + + cugraph_type_erased_device_array_view_free(vertices_view_ptr) + # No need to free 'first_view_ptr' and 'second_view_ptr' as their memory + # are already deallocated when freeing 'result_ptr' + + return cupy_first, cupy_second, cupy_similarity diff --git a/python/pylibcugraph/pylibcugraph/cosine_coefficients.pyx b/python/pylibcugraph/pylibcugraph/cosine_coefficients.pyx new file mode 100644 index 00000000000..df194fe364e --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/cosine_coefficients.pyx @@ -0,0 +1,171 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t +from libc.stdio cimport printf +from cython.operator cimport dereference + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_free +) +from pylibcugraph._cugraph_c.graph_functions cimport ( + cugraph_vertex_pairs_t, + cugraph_vertex_pairs_get_first, + cugraph_vertex_pairs_get_second, + cugraph_vertex_pairs_free, + cugraph_create_vertex_pairs +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.similarity_algorithms cimport ( + cugraph_cosine_similarity_coefficients, + cugraph_similarity_result_t, + cugraph_similarity_result_get_similarity, + cugraph_similarity_result_free +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + create_cugraph_type_erased_device_array_view_from_py_obj +) + + +def cosine_coefficients(ResourceHandle resource_handle, + _GPUGraph graph, + first, + second, + bool_t use_weight, + bool_t do_expensive_check): + """ + Compute the Cosine coefficients for the specified vertex_pairs. + + Note that Cosine similarity must run on a symmetric graph. 
+
+    Parameters
+    ----------
+    resource_handle : ResourceHandle
+        Handle to the underlying device resources needed for referencing data
+        and running algorithms.
+
+    graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    first :
+        Source of the vertex pair.
+
+    second :
+        Destination of the vertex pair.
+
+    use_weight : bool, optional
+        If set to True, compute weighted cosine coefficients (the input
+        graph must be weighted in that case). Otherwise, compute unweighted
+        cosine coefficients.
+
+    do_expensive_check : bool
+        If True, performs more extensive tests on the inputs to ensure
+        validity, at the expense of increased run time.
+
+    Returns
+    -------
+    A tuple of device arrays containing the vertex pairs with
+    their corresponding Cosine coefficient scores.
+
+    Examples
+    --------
+    # FIXME: No example yet
+
+    """
+
+    cdef cugraph_vertex_pairs_t* vertex_pairs_ptr
+
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = \
+        resource_handle.c_resource_handle_ptr
+    cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr
+
+    cdef cugraph_similarity_result_t* result_ptr
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+
+    # 'first' is a required parameter
+    cdef cugraph_type_erased_device_array_view_t* \
+        first_view_ptr = \
+            create_cugraph_type_erased_device_array_view_from_py_obj(
+                first)
+
+    # 'second' is a required parameter
+    cdef cugraph_type_erased_device_array_view_t* \
+        second_view_ptr = \
+            create_cugraph_type_erased_device_array_view_from_py_obj(
+                second)
+
+    error_code = cugraph_create_vertex_pairs(c_resource_handle_ptr,
+                                             c_graph_ptr,
+                                             first_view_ptr,
+                                             second_view_ptr,
+                                             &vertex_pairs_ptr,
+                                             &error_ptr)
+    assert_success(error_code, error_ptr, "vertex_pairs")
+
+    error_code = cugraph_cosine_similarity_coefficients(c_resource_handle_ptr,
+                                                        c_graph_ptr,
+                                                        vertex_pairs_ptr,
+                                                        use_weight,
+                                                        do_expensive_check,
+                                                        &result_ptr,
+                                                        &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_cosine_similarity_coefficients")
+
+    # Extract individual device array pointers from result and copy to cupy
+    # arrays for returning.
+    cdef cugraph_type_erased_device_array_view_t* similarity_ptr = \
+        cugraph_similarity_result_get_similarity(result_ptr)
+
+    cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr)
+
+    cdef cugraph_type_erased_device_array_view_t* first_ptr = \
+        cugraph_vertex_pairs_get_first(vertex_pairs_ptr)
+
+    cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_ptr)
+
+    cdef cugraph_type_erased_device_array_view_t* second_ptr = \
+        cugraph_vertex_pairs_get_second(vertex_pairs_ptr)
+
+    cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_ptr)
+
+    # Free all pointers
+    cugraph_similarity_result_free(result_ptr)
+    cugraph_vertex_pairs_free(vertex_pairs_ptr)
+
+    cugraph_type_erased_device_array_view_free(first_view_ptr)
+    cugraph_type_erased_device_array_view_free(second_view_ptr)
+
+    return cupy_first, cupy_second, cupy_similarity
diff --git a/python/pylibcugraph/pylibcugraph/utils.pxd b/python/pylibcugraph/pylibcugraph/utils.pxd
index 7fc140e9aed..21ab49a1f1e 100644
--- a/python/pylibcugraph/pylibcugraph/utils.pxd
+++ b/python/pylibcugraph/pylibcugraph/utils.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at @@ -57,3 +57,6 @@ cdef cugraph_type_erased_device_array_view_t* \ cdef create_cupy_array_view_for_device_ptr( cugraph_type_erased_device_array_view_t* device_array_view_ptr, owning_py_object) + +cdef extern from "stdint.h": + size_t SIZE_MAX
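The docstrings added by this patch intentionally leave their Examples sections as FIXMEs. For reviewers, a minimal usage sketch of the new PLC entry points could look like the code below. This is illustrative only and not part of the patch: it assumes the new functions are re-exported from the top-level pylibcugraph namespace and that ResourceHandle, GraphProperties, and SGGraph are constructed with their usual arguments; exact constructor keywords may differ between versions.

import cupy as cp
import pylibcugraph as plc

resource_handle = plc.ResourceHandle()
graph_props = plc.GraphProperties(is_symmetric=True, is_multigraph=False)

# A small symmetric (undirected) graph: every edge appears in both
# directions, since the similarity algorithms require a symmetric graph.
srcs = cp.asarray([0, 1, 1, 2, 2, 3], dtype="int32")
dsts = cp.asarray([1, 0, 2, 1, 3, 2], dtype="int32")

graph = plc.SGGraph(resource_handle, graph_props, srcs, dsts,
                    store_transposed=False, renumber=False,
                    do_expensive_check=False)

# Pairwise cosine coefficients for explicitly specified vertex pairs.
first, second, scores = plc.cosine_coefficients(
    resource_handle, graph,
    first=cp.asarray([0, 1], dtype="int32"),
    second=cp.asarray([2, 3], dtype="int32"),
    use_weight=False,
    do_expensive_check=False)

# All-pairs Sorensen coefficients over the whole graph, keeping only the
# 10 highest-scoring pairs; topk=None (mapped to SIZE_MAX) would return all.
first, second, scores = plc.all_pairs_sorensen_coefficients(
    resource_handle, graph, vertices=None,
    use_weight=False, topk=10,
    do_expensive_check=False)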