From 38d917a4687d38d6d8fc9ace95f51cc6fabb2da6 Mon Sep 17 00:00:00 2001 From: Md Naim Date: Fri, 25 Aug 2023 18:37:31 -0700 Subject: [PATCH 01/16] Refactor python code for similarity algos to use latest CAPI, get rid of python wrapper around legacy code, update CAPI and python tests --- cpp/tests/c_api/similarity_test.c | 57 ++-- python/cugraph/CMakeLists.txt | 1 - .../cugraph/dask/link_prediction/jaccard.py | 2 +- .../cugraph/dask/link_prediction/overlap.py | 2 +- .../cugraph/dask/link_prediction/sorensen.py | 2 +- .../cugraph/cugraph/experimental/__init__.py | 36 +-- .../experimental/link_prediction/__init__.py | 13 - .../experimental/link_prediction/jaccard.py | 255 ------------------ .../experimental/link_prediction/overlap.py | 223 --------------- .../experimental/link_prediction/sorensen.py | 221 --------------- .../cugraph/link_prediction/CMakeLists.txt | 22 -- .../cugraph/link_prediction/__init__.py | 7 +- .../cugraph/link_prediction/jaccard.pxd | 35 --- .../cugraph/link_prediction/jaccard.py | 146 ++++++---- .../link_prediction/jaccard_wrapper.pyx | 155 ----------- .../cugraph/link_prediction/overlap.pxd | 35 --- .../cugraph/link_prediction/overlap.py | 168 +++++++++--- .../link_prediction/overlap_wrapper.pyx | 142 ---------- .../cugraph/link_prediction/sorensen.py | 158 ++++++----- .../cugraph/link_prediction/wjaccard.py | 82 +++--- .../cugraph/link_prediction/woverlap.py | 52 +--- .../cugraph/link_prediction/wsorensen.py | 52 +--- .../tests/link_prediction/test_jaccard.py | 180 +++++-------- .../tests/link_prediction/test_sorensen.py | 10 +- python/pylibcugraph/pylibcugraph/__init__.py | 7 + .../pylibcugraph/experimental/__init__.py | 16 +- .../pylibcugraph/jaccard_coefficients.pyx | 8 +- .../pylibcugraph/overlap_coefficients.pyx | 2 +- .../pylibcugraph/sorensen_coefficients.pyx | 2 +- 29 files changed, 532 insertions(+), 1559 deletions(-) delete mode 100644 python/cugraph/cugraph/experimental/link_prediction/__init__.py delete mode 100644 python/cugraph/cugraph/experimental/link_prediction/jaccard.py delete mode 100644 python/cugraph/cugraph/experimental/link_prediction/overlap.py delete mode 100644 python/cugraph/cugraph/experimental/link_prediction/sorensen.py delete mode 100644 python/cugraph/cugraph/link_prediction/CMakeLists.txt delete mode 100644 python/cugraph/cugraph/link_prediction/jaccard.pxd delete mode 100644 python/cugraph/cugraph/link_prediction/jaccard_wrapper.pyx delete mode 100644 python/cugraph/cugraph/link_prediction/overlap.pxd delete mode 100644 python/cugraph/cugraph/link_prediction/overlap_wrapper.pyx diff --git a/cpp/tests/c_api/similarity_test.c b/cpp/tests/c_api/similarity_test.c index 20af3f3eccd..52f849ccd28 100644 --- a/cpp/tests/c_api/similarity_test.c +++ b/cpp/tests/c_api/similarity_test.c @@ -161,15 +161,16 @@ int test_jaccard() int test_weighted_jaccard() { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.357143, 0.208333, 0.0}; return generic_similarity_test(h_src, h_dst, @@ -215,15 +216,16 @@ int test_sorensen() int test_weighted_sorensen() { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.526316, 0.344828, 0.000000}; return generic_similarity_test(h_src, h_dst, @@ -269,15 +271,16 @@ int test_overlap() int test_weighted_overlap() { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.714286, 0.416667, 0.000000}; return generic_similarity_test(h_src, h_dst, @@ -301,8 +304,8 @@ int main(int argc, char** argv) result |= RUN_TEST(test_jaccard); result |= RUN_TEST(test_sorensen); result |= RUN_TEST(test_overlap); - // result |= RUN_TEST(test_weighted_jaccard); - // result |= RUN_TEST(test_weighted_sorensen); - // result |= RUN_TEST(test_weighted_overlap); + result |= RUN_TEST(test_weighted_jaccard); + result |= RUN_TEST(test_weighted_sorensen); + result |= RUN_TEST(test_weighted_overlap); return result; } diff --git a/python/cugraph/CMakeLists.txt b/python/cugraph/CMakeLists.txt index fc58c4de89c..63b63fea4fe 100644 --- a/python/cugraph/CMakeLists.txt +++ b/python/cugraph/CMakeLists.txt @@ -89,7 +89,6 @@ add_subdirectory(cugraph/dask/structure) add_subdirectory(cugraph/internals) add_subdirectory(cugraph/layout) add_subdirectory(cugraph/linear_assignment) -add_subdirectory(cugraph/link_prediction) add_subdirectory(cugraph/structure) add_subdirectory(cugraph/tree) add_subdirectory(cugraph/utilities) diff --git a/python/cugraph/cugraph/dask/link_prediction/jaccard.py b/python/cugraph/cugraph/dask/link_prediction/jaccard.py index b3d688584a0..218e6206fc3 100644 --- a/python/cugraph/cugraph/dask/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/dask/link_prediction/jaccard.py @@ -20,7 +20,7 @@ from cugraph.dask.common.input_utils import get_distributed_data from cugraph.utilities import renumber_vertex_pair -from pylibcugraph.experimental import ( +from pylibcugraph import ( jaccard_coefficients as pylibcugraph_jaccard_coefficients, ) from pylibcugraph import ResourceHandle diff --git a/python/cugraph/cugraph/dask/link_prediction/overlap.py b/python/cugraph/cugraph/dask/link_prediction/overlap.py index c47aeef3c72..5540be28fd1 100644 --- a/python/cugraph/cugraph/dask/link_prediction/overlap.py +++ b/python/cugraph/cugraph/dask/link_prediction/overlap.py @@ -20,7 +20,7 @@ from cugraph.dask.common.input_utils import get_distributed_data from cugraph.utilities import renumber_vertex_pair -from pylibcugraph.experimental import ( +from pylibcugraph import ( overlap_coefficients as pylibcugraph_overlap_coefficients, ) from pylibcugraph import ResourceHandle diff --git a/python/cugraph/cugraph/dask/link_prediction/sorensen.py b/python/cugraph/cugraph/dask/link_prediction/sorensen.py index bb5a3f44f39..24295ac330c 100644 --- a/python/cugraph/cugraph/dask/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/dask/link_prediction/sorensen.py @@ -20,7 +20,7 @@ from cugraph.dask.common.input_utils import get_distributed_data from cugraph.utilities import renumber_vertex_pair -from pylibcugraph.experimental import ( +from pylibcugraph import ( sorensen_coefficients as pylibcugraph_sorensen_coefficients, ) from pylibcugraph import ResourceHandle diff --git a/python/cugraph/cugraph/experimental/__init__.py b/python/cugraph/cugraph/experimental/__init__.py index b96b760e634..5fa0a567675 100644 --- a/python/cugraph/cugraph/experimental/__init__.py +++ b/python/cugraph/cugraph/experimental/__init__.py @@ -48,29 +48,29 @@ experimental_warning_wrapper(EXPERIMENTAL__find_bicliques) ) -from cugraph.experimental.link_prediction.jaccard import ( - EXPERIMENTAL__jaccard, - EXPERIMENTAL__jaccard_coefficient, -) +# from cugraph.experimental.link_prediction.jaccard import ( +# EXPERIMENTAL__jaccard, +# EXPERIMENTAL__jaccard_coefficient, +# ) -jaccard = experimental_warning_wrapper(EXPERIMENTAL__jaccard) -jaccard_coefficient = experimental_warning_wrapper(EXPERIMENTAL__jaccard_coefficient) +# jaccard = experimental_warning_wrapper(EXPERIMENTAL__jaccard) +# jaccard_coefficient = experimental_warning_wrapper(EXPERIMENTAL__jaccard_coefficient) -from cugraph.experimental.link_prediction.sorensen import ( - EXPERIMENTAL__sorensen, - EXPERIMENTAL__sorensen_coefficient, -) +# from cugraph.experimental.link_prediction.sorensen import ( +# EXPERIMENTAL__sorensen, +# EXPERIMENTAL__sorensen_coefficient, +# ) -sorensen = experimental_warning_wrapper(EXPERIMENTAL__sorensen) -sorensen_coefficient = experimental_warning_wrapper(EXPERIMENTAL__sorensen_coefficient) +# sorensen = experimental_warning_wrapper(EXPERIMENTAL__sorensen) +# sorensen_coefficient = experimental_warning_wrapper(EXPERIMENTAL__sorensen_coefficient) -from cugraph.experimental.link_prediction.overlap import ( - EXPERIMENTAL__overlap, - EXPERIMENTAL__overlap_coefficient, -) +# from cugraph.experimental.link_prediction.overlap import ( +# EXPERIMENTAL__overlap, +# EXPERIMENTAL__overlap_coefficient, +# ) -overlap = experimental_warning_wrapper(EXPERIMENTAL__overlap) -overlap_coefficient = experimental_warning_wrapper(EXPERIMENTAL__overlap_coefficient) +# overlap = experimental_warning_wrapper(EXPERIMENTAL__overlap) +# overlap_coefficient = experimental_warning_wrapper(EXPERIMENTAL__overlap_coefficient) from cugraph.gnn.data_loading import EXPERIMENTAL__BulkSampler diff --git a/python/cugraph/cugraph/experimental/link_prediction/__init__.py b/python/cugraph/cugraph/experimental/link_prediction/__init__.py deleted file mode 100644 index 081b2ae8260..00000000000 --- a/python/cugraph/cugraph/experimental/link_prediction/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/python/cugraph/cugraph/experimental/link_prediction/jaccard.py b/python/cugraph/cugraph/experimental/link_prediction/jaccard.py deleted file mode 100644 index 2eba73b3824..00000000000 --- a/python/cugraph/cugraph/experimental/link_prediction/jaccard.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from cugraph.utilities import ( - ensure_cugraph_obj_for_nx, - df_edge_score_to_dictionary, - renumber_vertex_pair, -) -import cudf -import warnings - -from pylibcugraph.experimental import ( - jaccard_coefficients as pylibcugraph_jaccard_coefficients, -) -from pylibcugraph import ResourceHandle - - -# FIXME: Move this function to the utility module so that it can be -# shared by other algos -def ensure_valid_dtype(input_graph, vertex_pair): - - vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] - vertex_pair_dtypes = vertex_pair.dtypes - - if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: - warning_msg = ( - "Jaccard requires 'vertex_pair' to match the graph's 'vertex' type. " - f"input graph's vertex type is: {vertex_dtype} and got " - f"'vertex_pair' of type: {vertex_pair_dtypes}." - ) - warnings.warn(warning_msg, UserWarning) - vertex_pair = vertex_pair.astype(vertex_dtype) - - return vertex_pair - - -def EXPERIMENTAL__jaccard(G, vertex_pair=None, use_weight=False): - """ - Compute the Jaccard similarity between each pair of vertices connected by - an edge, or between arbitrary pairs of vertices specified by the user. - Jaccard similarity is defined between two sets as the ratio of the volume - of their intersection divided by the volume of their union. In the context - of graphs, the neighborhood of a vertex is seen as a set. The Jaccard - similarity weight of each edge represents the strength of connection - between vertices based on the relative similarity of their neighbors. If - first is specified but second is not, or vice versa, an exception will be - thrown. - - NOTE: If the vertex_pair parameter is not specified then the behavior - of cugraph.jaccard is different from the behavior of - networkx.jaccard_coefficient. - - cugraph.jaccard, in the absence of a specified vertex pair list, will - compute the two_hop_neighbors of the entire graph to construct a vertex pair - list and will return the jaccard coefficient for those vertex pairs. This is - not advisable as the vertex_pairs can grow exponentially with respect to the - size of the datasets - - networkx.jaccard_coefficient, in the absence of a specified vertex - pair list, will return an upper triangular dense matrix, excluding - the diagonal as well as vertex pairs that are directly connected - by an edge in the graph, of jaccard coefficients. Technically, networkx - returns a lazy iterator across this upper triangular matrix where - the actual jaccard coefficient is computed when the iterator is - dereferenced. Computing a dense matrix of results is not feasible - if the number of vertices in the graph is large (100,000 vertices - would result in 4.9 billion values in that iterator). - - If your graph is small enough (or you have enough memory and patience) - you can get the interesting (non-zero) values that are part of the networkx - solution by doing the following: - - >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> pairs = G.get_two_hop_neighbors() - >>> df = cugraph.jaccard(G, pairs) - - But please remember that cugraph will fill the dataframe with the entire - solution you request, so you'll need enough memory to store the 2-hop - neighborhood dataframe. - - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - - This implementation only supports undirected, unweighted Graph. - - vertex_pair : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the jaccard coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the jaccard coefficient for all - adjacent vertices in the graph. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Jaccard weights. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - df['second'] : cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - df['jaccard_coeff'] : cudf.Series - The computed jaccard coefficient between the first and the second - vertex ID. - - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import jaccard as exp_jaccard - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_jaccard(G) - - """ - if G.is_directed(): - raise ValueError("Input must be an undirected Graph.") - - if G.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if vertex_pair is None: - # Call two_hop neighbor of the entire graph - vertex_pair = G.get_two_hop_neighbors() - - v_p_num_col = len(vertex_pair.columns) - - if isinstance(vertex_pair, cudf.DataFrame): - vertex_pair = renumber_vertex_pair(G, vertex_pair) - vertex_pair = ensure_valid_dtype(G, vertex_pair) - src_col_name = vertex_pair.columns[0] - dst_col_name = vertex_pair.columns[1] - first = vertex_pair[src_col_name] - second = vertex_pair[dst_col_name] - - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - use_weight = False - first, second, jaccard_coeff = pylibcugraph_jaccard_coefficients( - resource_handle=ResourceHandle(), - graph=G._plc_graph, - first=first, - second=second, - use_weight=use_weight, - do_expensive_check=False, - ) - - if G.renumbered: - vertex_pair = G.unrenumber(vertex_pair, src_col_name, preserve_order=True) - vertex_pair = G.unrenumber(vertex_pair, dst_col_name, preserve_order=True) - - if v_p_num_col == 2: - # single column vertex - vertex_pair = vertex_pair.rename( - columns={src_col_name: "first", dst_col_name: "second"} - ) - - df = vertex_pair - df["jaccard_coeff"] = cudf.Series(jaccard_coeff) - - return df - - -def EXPERIMENTAL__jaccard_coefficient(G, ebunch=None, use_weight=False): - """ - For NetworkX Compatability. See `jaccard` - - Parameters - ---------- - graph : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - - ebunch : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the jaccard coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the jaccard coefficient for all - adjacent vertices in the graph. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Jaccard weights. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - ddf['first']: dask_cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - ddf['second']: dask_cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - ddf['jaccard_coeff']: dask_cudf.Series - The computed jaccard coefficient between the first and the second - vertex ID. - - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import jaccard_coefficient as exp_jaccard_coefficient - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_jaccard_coefficient(G) - - """ - vertex_pair = None - - G, isNx = ensure_cugraph_obj_for_nx(G) - - # FIXME: What is the logic behind this since the docstrings mention that 'G' and - # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? - if isNx is True and ebunch is not None: - vertex_pair = cudf.DataFrame(ebunch) - - df = EXPERIMENTAL__jaccard(G, vertex_pair) - - if isNx is True: - df = df_edge_score_to_dictionary( - df, k="jaccard_coeff", src="first", dst="second" - ) - - return df diff --git a/python/cugraph/cugraph/experimental/link_prediction/overlap.py b/python/cugraph/cugraph/experimental/link_prediction/overlap.py deleted file mode 100644 index 0981ced4835..00000000000 --- a/python/cugraph/cugraph/experimental/link_prediction/overlap.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from cugraph.utilities import ( - ensure_cugraph_obj_for_nx, - df_edge_score_to_dictionary, - renumber_vertex_pair, -) -import cudf -import warnings - -from pylibcugraph.experimental import ( - overlap_coefficients as pylibcugraph_overlap_coefficients, -) -from pylibcugraph import ResourceHandle - - -# FIXME: Move this function to the utility module so that it can be -# shared by other algos -def ensure_valid_dtype(input_graph, vertex_pair): - - vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] - vertex_pair_dtypes = vertex_pair.dtypes - - if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: - warning_msg = ( - "Overlap requires 'vertex_pair' to match the graph's 'vertex' type. " - f"input graph's vertex type is: {vertex_dtype} and got " - f"'vertex_pair' of type: {vertex_pair_dtypes}." - ) - warnings.warn(warning_msg, UserWarning) - vertex_pair = vertex_pair.astype(vertex_dtype) - - return vertex_pair - - -def EXPERIMENTAL__overlap_coefficient(G, ebunch=None, use_weight=False): - """ - For NetworkX Compatability. See `overlap` - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - - ebunch : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the Overlap coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the overlap coefficient for all - adjacent vertices in the graph. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the overlap weights. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - ddf['first']: dask_cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - ddf['second']: dask_cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - ddf['overlap_coeff']: dask_cudf.Series - The computed overlap coefficient between the first and the second - vertex ID. - - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import overlap_coefficient as exp_overlap_coefficient - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_overlap_coefficient(G) - """ - vertex_pair = None - - G, isNx = ensure_cugraph_obj_for_nx(G) - - # FIXME: What is the logic behind this since the docstrings mention that 'G' and - # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? - if isNx is True and ebunch is not None: - vertex_pair = cudf.DataFrame(ebunch) - - df = EXPERIMENTAL__overlap(G, vertex_pair) - - if isNx is True: - df = df_edge_score_to_dictionary( - df, k="overlap_coeff", src="first", dst="second" - ) - - return df - - -def EXPERIMENTAL__overlap(G, vertex_pair=None, use_weight=False): - """ - Compute the Overlap Coefficient between each pair of vertices connected by - an edge, or between arbitrary pairs of vertices specified by the user. - Overlap Coefficient is defined between two sets as the ratio of the volume - of their intersection divided by the smaller of their two volumes. In the - context of graphs, the neighborhood of a vertex is seen as a set. The - Overlap Coefficient weight of each edge represents the strength of - connection between vertices based on the relative similarity of their - neighbors. If first is specified but second is not, or vice versa, an - exception will be thrown. - - cugraph.overlap, in the absence of a specified vertex pair list, will - compute the two_hop_neighbors of the entire graph to construct a vertex pair - list and will return the overlap coefficient for those vertex pairs. This is - not advisable as the vertex_pairs can grow exponentially with respect to the - size of the datasets - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - adjacency list will be computed if not already present. - - This implementation only supports undirected, unweighted Graph. - - vertex_pair : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the overlap coefficient is computed for the - given vertex pairs, else, it is computed for all vertex pairs. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Overlap coefficients. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - df['second'] : cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - df['overlap_coeff'] : cudf.Series - The computed overlap coefficient between the first and the second - vertex ID. - - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import overlap as exp_overlap - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_overlap(G) - - """ - - if G.is_directed(): - raise ValueError("Input must be an undirected Graph.") - - if G.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if vertex_pair is None: - # Call two_hop neighbor of the entire graph - vertex_pair = G.get_two_hop_neighbors() - - v_p_num_col = len(vertex_pair.columns) - - if isinstance(vertex_pair, cudf.DataFrame): - vertex_pair = renumber_vertex_pair(G, vertex_pair) - vertex_pair = ensure_valid_dtype(G, vertex_pair) - src_col_name = vertex_pair.columns[0] - dst_col_name = vertex_pair.columns[1] - first = vertex_pair[src_col_name] - second = vertex_pair[dst_col_name] - - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - use_weight = False - first, second, overlap_coeff = pylibcugraph_overlap_coefficients( - resource_handle=ResourceHandle(), - graph=G._plc_graph, - first=first, - second=second, - use_weight=use_weight, - do_expensive_check=False, - ) - - if G.renumbered: - vertex_pair = G.unrenumber(vertex_pair, src_col_name, preserve_order=True) - vertex_pair = G.unrenumber(vertex_pair, dst_col_name, preserve_order=True) - - if v_p_num_col == 2: - # single column vertex - vertex_pair = vertex_pair.rename( - columns={src_col_name: "first", dst_col_name: "second"} - ) - - df = vertex_pair - df["overlap_coeff"] = cudf.Series(overlap_coeff) - - return df diff --git a/python/cugraph/cugraph/experimental/link_prediction/sorensen.py b/python/cugraph/cugraph/experimental/link_prediction/sorensen.py deleted file mode 100644 index ed27e4813d3..00000000000 --- a/python/cugraph/cugraph/experimental/link_prediction/sorensen.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from cugraph.utilities import ( - ensure_cugraph_obj_for_nx, - df_edge_score_to_dictionary, - renumber_vertex_pair, -) -import cudf -import warnings -from pylibcugraph.experimental import ( - sorensen_coefficients as pylibcugraph_sorensen_coefficients, -) -from pylibcugraph import ResourceHandle - - -# FIXME: Move this function to the utility module so that it can be -# shared by other algos -def ensure_valid_dtype(input_graph, vertex_pair): - - vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] - vertex_pair_dtypes = vertex_pair.dtypes - - if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: - warning_msg = ( - "Sorensen requires 'vertex_pair' to match the graph's 'vertex' type. " - f"input graph's vertex type is: {vertex_dtype} and got " - f"'vertex_pair' of type: {vertex_pair_dtypes}." - ) - warnings.warn(warning_msg, UserWarning) - vertex_pair = vertex_pair.astype(vertex_dtype) - - return vertex_pair - - -def EXPERIMENTAL__sorensen(G, vertex_pair=None, use_weight=False): - """ - Compute the Sorensen coefficient between each pair of vertices connected by - an edge, or between arbitrary pairs of vertices specified by the user. - Sorensen coefficient is defined between two sets as the ratio of twice the - volume of their intersection divided by the volume of each set. - If first is specified but second is not, or vice versa, an exception will - be thrown. - - cugraph.sorensen, in the absence of a specified vertex pair list, will - compute the two_hop_neighbors of the entire graph to construct a vertex pair - list and will return the sorensen coefficient for those vertex pairs. This is - not advisable as the vertex_pairs can grow exponentially with respect to the - size of the datasets - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - - This implementation only supports undirected, unweighted Graph. - - vertex_pair : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the Sorensen coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the Sorensen coefficient for all - adjacent vertices in the graph. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Sorensen index. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - df['second'] : cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - df['sorensen_coeff'] : cudf.Series - The computed sorensen coefficient between the first and the second - vertex ID. - - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import sorensen as exp_sorensen - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_sorensen(G) - - """ - if G.is_directed(): - raise ValueError("Input must be an undirected Graph.") - - if G.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if vertex_pair is None: - # Call two_hop neighbor of the entire graph - vertex_pair = G.get_two_hop_neighbors() - - v_p_num_col = len(vertex_pair.columns) - - if isinstance(vertex_pair, cudf.DataFrame): - vertex_pair = renumber_vertex_pair(G, vertex_pair) - vertex_pair = ensure_valid_dtype(G, vertex_pair) - src_col_name = vertex_pair.columns[0] - dst_col_name = vertex_pair.columns[1] - first = vertex_pair[src_col_name] - second = vertex_pair[dst_col_name] - - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - use_weight = False - first, second, sorensen_coeff = pylibcugraph_sorensen_coefficients( - resource_handle=ResourceHandle(), - graph=G._plc_graph, - first=first, - second=second, - use_weight=use_weight, - do_expensive_check=False, - ) - - if G.renumbered: - vertex_pair = G.unrenumber(vertex_pair, src_col_name, preserve_order=True) - vertex_pair = G.unrenumber(vertex_pair, dst_col_name, preserve_order=True) - - if v_p_num_col == 2: - # single column vertex - vertex_pair = vertex_pair.rename( - columns={src_col_name: "first", dst_col_name: "second"} - ) - - df = vertex_pair - df["sorensen_coeff"] = cudf.Series(sorensen_coeff) - - return df - - -def EXPERIMENTAL__sorensen_coefficient(G, ebunch=None, use_weight=False): - """ - For NetworkX Compatability. See `sorensen` - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - ebunch : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the sorensen coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the sorensen coefficient for all - adjacent vertices in the graph. - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Sorensen weights. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - df['second'] : cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - df['sorensen_coeff'] : cudf.Series - The computed sorensen coefficient between the first and the second - vertex ID. - - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import sorensen_coefficient as exp_sorensen_coef - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_sorensen_coef(G) - - """ - vertex_pair = None - - G, isNx = ensure_cugraph_obj_for_nx(G) - - # FIXME: What is the logic behind this since the docstrings mention that 'G' and - # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? - if isNx is True and ebunch is not None: - vertex_pair = cudf.DataFrame(ebunch) - - df = EXPERIMENTAL__sorensen(G, vertex_pair) - - if isNx is True: - df = df_edge_score_to_dictionary( - df, k="sorensen_coeff", src="first", dst="second" - ) - - return df diff --git a/python/cugraph/cugraph/link_prediction/CMakeLists.txt b/python/cugraph/cugraph/link_prediction/CMakeLists.txt deleted file mode 100644 index a117cf9afc3..00000000000 --- a/python/cugraph/cugraph/link_prediction/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(cython_sources jaccard_wrapper.pyx overlap_wrapper.pyx) -set(linked_libraries cugraph::cugraph) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX link_prediction_ - ASSOCIATED_TARGETS cugraph -) diff --git a/python/cugraph/cugraph/link_prediction/__init__.py b/python/cugraph/cugraph/link_prediction/__init__.py index a6911d3b8ae..2622c20f4c7 100644 --- a/python/cugraph/cugraph/link_prediction/__init__.py +++ b/python/cugraph/cugraph/link_prediction/__init__.py @@ -11,13 +11,18 @@ # See the License for the specific language governing permissions and # limitations under the License. + +from cugraph.utilities.api_tools import deprecated_warning_wrapper from cugraph.link_prediction.jaccard import jaccard from cugraph.link_prediction.jaccard import jaccard_coefficient from cugraph.link_prediction.overlap import overlap from cugraph.link_prediction.wjaccard import jaccard_w +jaccard_w = deprecated_warning_wrapper(jaccard_w) from cugraph.link_prediction.woverlap import overlap_w +overlap_w = deprecated_warning_wrapper(overlap_w) from cugraph.link_prediction.wsorensen import sorensen_w +sorensen_w = deprecated_warning_wrapper(sorensen_w) from cugraph.link_prediction.jaccard import jaccard_coefficient from cugraph.link_prediction.sorensen import sorensen_coefficient from cugraph.link_prediction.sorensen import sorensen -from cugraph.link_prediction.overlap import overlap_coefficient +from cugraph.link_prediction.overlap import overlap_coefficient \ No newline at end of file diff --git a/python/cugraph/cugraph/link_prediction/jaccard.pxd b/python/cugraph/cugraph/link_prediction/jaccard.pxd deleted file mode 100644 index 9e8c82ec3d8..00000000000 --- a/python/cugraph/cugraph/link_prediction/jaccard.pxd +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.structure.graph_primtypes cimport * - - -cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": - - cdef void jaccard[VT,ET,WT]( - const GraphCSRView[VT,ET,WT] &graph, - const WT *weights, - WT *result) except + - - cdef void jaccard_list[VT,ET,WT]( - const GraphCSRView[VT,ET,WT] &graph, - const WT *weights, - ET num_pairs, - const VT *first, - const VT *second, - WT *result) except + diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py index f1b488c8cca..6013d04ecef 100644 --- a/python/cugraph/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/link_prediction/jaccard.py @@ -11,16 +11,45 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cudf -from cugraph.link_prediction import jaccard_wrapper from cugraph.utilities import ( ensure_cugraph_obj_for_nx, df_edge_score_to_dictionary, renumber_vertex_pair, ) +import cudf +import warnings + +from pylibcugraph import ( + jaccard_coefficients as pylibcugraph_jaccard_coefficients, +) +from pylibcugraph import ResourceHandle -def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] + vertex_pair_dtypes = vertex_pair.dtypes + + if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: + warning_msg = ( + "Jaccard requires 'vertex_pair' to match the graph's 'vertex' type. " + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair + + +# FIXME: +# 1. Be consistent with column names for output/result DataFrame +# 2. Enforce that 'vertex_pair' is a cudf Dataframe with only columns for vertex pairs +# 3. We need to add support for multi-column vertices + +def jaccard(input_graph, vertex_pair=None, do_expensive_check=False, use_weight=False): """ Compute the Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -36,13 +65,11 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): of cugraph.jaccard is different from the behavior of networkx.jaccard_coefficient. - This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. - cugraph.jaccard, in the absence of a specified vertex pair list, will - use the edges of the graph to construct a vertex pair list and will - return the jaccard coefficient for those vertex pairs. + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the jaccard coefficient for those vertex pairs. This is + not advisable as the vertex_pairs can grow exponentially with respect to the + size of the datasets networkx.jaccard_coefficient, in the absence of a specified vertex pair list, will return an upper triangular dense matrix, excluding @@ -59,9 +86,9 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): solution by doing the following: >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> pairs = G.get_two_hop_neighbors() - >>> df = cugraph.jaccard(G, pairs) + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> pairs = input_graph.get_two_hop_neighbors() + >>> df = cugraph.jaccard(input_graph, pairs) But please remember that cugraph will fill the dataframe with the entire solution you request, so you'll need enough memory to store the 2-hop @@ -72,11 +99,13 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): ---------- input_graph : cugraph.Graph cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The + as an edge list (edge weights are not supported yet for this algorithm). The graph should be undirected where an undirected edge is represented by a directed edge in both direction. The adjacency list will be computed if not already present. + This implementation only supports undirected, unweighted Graph. + vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of vertices. If provided, the jaccard coefficient is computed for the @@ -84,9 +113,8 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): current implementation computes the jaccard coefficient for all adjacent vertices in the graph. - do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + use_weight : bool, optional (default=False) + Currently not supported Returns ------- @@ -96,10 +124,10 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): relative to the adjacency list, or that given by the specified vertex pairs. - df['source'] : cudf.Series - The source vertex ID (will be identical to first if specified). - df['destination'] : cudf.Series - The destination vertex ID (will be identical to second if + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + The second vertex ID of each pair (will be identical to second if specified). df['jaccard_coeff'] : cudf.Series The computed jaccard coefficient between the first and the second @@ -108,55 +136,72 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): Examples -------- >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> df = cugraph.jaccard(G) + >>> from cugraph import jaccard + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = jaccard(input_graph) """ - if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") + + print ('use_weight =', use_weight) + + warnings.warn("do_expensive_check is deprecated since it is no longer needed", + DeprecationWarning) if input_graph.is_directed(): raise ValueError("Input must be an undirected Graph.") - if type(vertex_pair) == cudf.DataFrame: + + if vertex_pair is None: + # Call two_hop neighbor of the entire graph + vertex_pair = input_graph.get_two_hop_neighbors() + + v_p_num_col = len(vertex_pair.columns) + + if isinstance(vertex_pair, cudf.DataFrame): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + vertex_pair = ensure_valid_dtype(input_graph, vertex_pair) + src_col_name = vertex_pair.columns[0] + dst_col_name = vertex_pair.columns[1] + first = vertex_pair[src_col_name] + second = vertex_pair[dst_col_name] + elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") + raise ValueError("vertex_pair must be a cudf Dataframe") - df = jaccard_wrapper.jaccard(input_graph, None, vertex_pair) + first, second, jaccard_coeff = pylibcugraph_jaccard_coefficients( + resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + first=first, + second=second, + use_weight=use_weight, + do_expensive_check=False, + ) if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") + vertex_pair = input_graph.unrenumber(vertex_pair, src_col_name, preserve_order=True) + vertex_pair = input_graph.unrenumber(vertex_pair, dst_col_name, preserve_order=True) + + + if v_p_num_col == 2: + # single column vertex + vertex_pair = vertex_pair.rename( + columns={src_col_name: "first", dst_col_name: "second"} + ) + + df = vertex_pair + df["jaccard_coeff"] = cudf.Series(jaccard_coeff) return df -def jaccard_coefficient(G, ebunch=None, do_expensive_check=True): +def jaccard_coefficient(G, ebunch=None, do_expensive_check=False): """ For NetworkX Compatability. See `jaccard` - NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. - Parameters ---------- - graph : cugraph.Graph + G : cugraph.Graph cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The + as an edge list (edge weights are not supported yet for this algorithm). The graph should be undirected where an undirected edge is represented by a directed edge in both direction. The adjacency list will be computed if not already present. @@ -168,6 +213,9 @@ def jaccard_coefficient(G, ebunch=None, do_expensive_check=True): current implementation computes the jaccard coefficient for all adjacent vertices in the graph. + do_expensive_check : bool, optional (default=False) + Currently not supported + Returns ------- df : cudf.DataFrame diff --git a/python/cugraph/cugraph/link_prediction/jaccard_wrapper.pyx b/python/cugraph/cugraph/link_prediction/jaccard_wrapper.pyx deleted file mode 100644 index e66d8bf0b5c..00000000000 --- a/python/cugraph/cugraph/link_prediction/jaccard_wrapper.pyx +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.link_prediction.jaccard cimport jaccard as c_jaccard -from cugraph.link_prediction.jaccard cimport jaccard_list as c_jaccard_list -from cugraph.structure.graph_primtypes cimport * -from cugraph.structure import graph_primtypes_wrapper -from libc.stdint cimport uintptr_t -import cudf -import numpy as np - - -def jaccard(input_graph, weights_arr=None, vertex_pair=None): - """ - Call jaccard or jaccard_list - """ - offsets = None - indices = None - - if input_graph.adjlist: - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, - input_graph.adjlist.indices], [np.int32]) - elif input_graph.transposedadjlist: - # - # NOTE: jaccard ONLY operates on an undirected graph, so CSR and CSC should be - # equivalent. The undirected check has already happened, so we'll just use - # the CSC as if it were CSR. - # - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.transposedadjlist.offsets, - input_graph.transposedadjlist.indices], [np.int32]) - else: - input_graph.view_adj_list() - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, - input_graph.adjlist.indices], [np.int32]) - - num_verts = input_graph.number_of_vertices() - num_edges = input_graph.number_of_edges(directed_edges=True) - - first = None - second = None - - cdef uintptr_t c_result_col = NULL - cdef uintptr_t c_first_col = NULL - cdef uintptr_t c_second_col = NULL - cdef uintptr_t c_src_index_col = NULL - cdef uintptr_t c_dst_index_col = NULL - cdef uintptr_t c_weights = NULL - cdef uintptr_t c_offsets = offsets.__cuda_array_interface__['data'][0] - cdef uintptr_t c_indices = indices.__cuda_array_interface__['data'][0] - - cdef GraphCSRView[int,int,float] graph_float - cdef GraphCSRView[int,int,double] graph_double - - weight_type = np.float32 - - if weights_arr is not None: - [weights] = graph_primtypes_wrapper.datatype_cast([weights_arr], [np.float32, np.float64]) - c_weights = weights.__cuda_array_interface__['data'][0] - weight_type = weights.dtype - - if type(vertex_pair) == cudf.DataFrame: - result_size = len(vertex_pair) - result = cudf.Series(np.ones(result_size, dtype=weight_type)) - c_result_col = result.__cuda_array_interface__['data'][0] - - df = cudf.DataFrame() - df['jaccard_coeff'] = result - - cols = vertex_pair.columns.to_list() - first = vertex_pair[cols[0]].astype(np.int32) - second = vertex_pair[cols[1]].astype(np.int32) - - # FIXME: multi column support - df['first'] = first - df['second'] = second - c_first_col = first.__cuda_array_interface__['data'][0] - c_second_col = second.__cuda_array_interface__['data'][0] - - if weight_type == np.float32: - graph_float = GraphCSRView[int,int,float](c_offsets, c_indices, - c_weights, num_verts, num_edges) - c_jaccard_list[int,int,float](graph_float, - c_weights, - result_size, - c_first_col, - c_second_col, - c_result_col) - else: - graph_double = GraphCSRView[int,int,double](c_offsets, c_indices, - c_weights, num_verts, num_edges) - c_jaccard_list[int,int,double](graph_double, - c_weights, - result_size, - c_first_col, - c_second_col, - c_result_col) - - return df - else: - # error check performed in jaccard.py - assert vertex_pair is None - - df = cudf.DataFrame() - df['first'] = cudf.Series(np.zeros(num_edges, indices.dtype)) - df['second'] = indices - - c_src_index_col = df['first'].__cuda_array_interface__['data'][0] - - if weight_type == np.float32: - df['jaccard_coeff'] = cudf.Series(np.ones(num_edges, dtype=np.float32), - nan_as_null=False) - c_result_col = df['jaccard_coeff'].__cuda_array_interface__['data'][0] - - graph_float = GraphCSRView[int,int,float](c_offsets, - c_indices, - c_weights, - num_verts, - num_edges) - c_jaccard[int,int,float](graph_float, - c_weights, - c_result_col) - - graph_float.get_source_indices(c_src_index_col) - else: - df['jaccard_coeff'] = cudf.Series(np.ones(num_edges, dtype=np.float64), - nan_as_null=False) - c_result_col = df['jaccard_coeff'].__cuda_array_interface__['data'][0] - - graph_double = GraphCSRView[int,int,double](c_offsets, - c_indices, - c_weights, - num_verts, - num_edges) - c_jaccard[int,int,double](graph_double, - c_weights, - c_result_col) - - graph_double.get_source_indices(c_src_index_col) - - return df diff --git a/python/cugraph/cugraph/link_prediction/overlap.pxd b/python/cugraph/cugraph/link_prediction/overlap.pxd deleted file mode 100644 index f0654472587..00000000000 --- a/python/cugraph/cugraph/link_prediction/overlap.pxd +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.structure.graph_primtypes cimport * - - -cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": - - cdef void overlap[VT,ET,WT]( - const GraphCSRView[VT,ET,WT] &graph, - const WT *weights, - WT *result) except + - - cdef void overlap_list[VT,ET,WT]( - const GraphCSRView[VT,ET,WT] &graph, - const WT *weights, - ET num_pairs, - const VT *first, - const VT *second, - WT *result) except + diff --git a/python/cugraph/cugraph/link_prediction/overlap.py b/python/cugraph/cugraph/link_prediction/overlap.py index 9bb7b76b0ca..231906951a0 100644 --- a/python/cugraph/cugraph/link_prediction/overlap.py +++ b/python/cugraph/cugraph/link_prediction/overlap.py @@ -11,28 +11,92 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.link_prediction import overlap_wrapper -import cudf from cugraph.utilities import ( ensure_cugraph_obj_for_nx, df_edge_score_to_dictionary, renumber_vertex_pair, ) +import cudf +import warnings + +from pylibcugraph import ( + overlap_coefficients as pylibcugraph_overlap_coefficients, +) +from pylibcugraph import ResourceHandle + + +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] + vertex_pair_dtypes = vertex_pair.dtypes + + if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: + warning_msg = ( + "Overlap requires 'vertex_pair' to match the graph's 'vertex' type. " + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair -def overlap_coefficient(G, ebunch=None, do_expensive_check=True): +def overlap_coefficient(G, ebunch=None, use_weight=False): """ For NetworkX Compatability. See `overlap` - NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. + Parameters + ---------- + G : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list (edge weights are not supported yet for this algorithm). The + graph should be undirected where an undirected edge is represented by a + directed edge in both direction. The adjacency list will be computed if + not already present. + + ebunch : cudf.DataFrame, optional (default=None) + A GPU dataframe consisting of two columns representing pairs of + vertices. If provided, the Overlap coefficient is computed for the + given vertex pairs. If the vertex_pair is not provided then the + current implementation computes the overlap coefficient for all + adjacent vertices in the graph. + use_weight : bool, optional (default=False) + Currently not supported + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the overlap weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. + + ddf['first']: dask_cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + ddf['second']: dask_cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + ddf['overlap_coeff']: dask_cudf.Series + The computed overlap coefficient between the first and the second + vertex ID. + + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import overlap_coefficient + >>> G = karate.get_graph(download=True, ignore_weights=True) + >>> df = overlap_coefficient(G) """ vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) + # FIXME: What is the logic behind this since the docstrings mention that 'G' and + # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? if isNx is True and ebunch is not None: vertex_pair = cudf.DataFrame(ebunch) @@ -45,8 +109,12 @@ def overlap_coefficient(G, ebunch=None, do_expensive_check=True): return df +# FIXME: +# 1. Be consistent with column names for output/result DataFrame +# 2. Enforce that 'vertex_pair' is a cudf Dataframe with only columns for vertex pairs +# 3. We need to add support for multi-column vertices -def overlap(input_graph, vertex_pair=None, do_expensive_check=True): +def overlap(G, vertex_pair=None, use_weight=False): """ Compute the Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -58,25 +126,28 @@ def overlap(input_graph, vertex_pair=None, do_expensive_check=True): neighbors. If first is specified but second is not, or vice versa, an exception will be thrown. - NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. + cugraph.overlap, in the absence of a specified vertex pair list, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the overlap coefficient for those vertex pairs. This is + not advisable as the vertex_pairs can grow exponentially with respect to the + size of the datasets Parameters ---------- - input_graph : cugraph.Graph + G : cugraph.Graph cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The + as an edge list (edge weights are not supported yet for this algorithm). The adjacency list will be computed if not already present. + This implementation only supports undirected, unweighted Graph. + vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of vertices. If provided, the overlap coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. - do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + use_weight : bool, optional (default=False) + Currently not supported Returns ------- @@ -98,35 +169,54 @@ def overlap(input_graph, vertex_pair=None, do_expensive_check=True): Examples -------- >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> df = cugraph.overlap(G) + >>> from cugraph import overlap + >>> G = karate.get_graph(download=True, ignore_weights=True) + >>> df = overlap(G) """ - if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(vertex_pair) == cudf.DataFrame: - vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + + if G.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + + if vertex_pair is None: + # Call two_hop neighbor of the entire graph + vertex_pair = G.get_two_hop_neighbors() + + v_p_num_col = len(vertex_pair.columns) + + if isinstance(vertex_pair, cudf.DataFrame): + vertex_pair = renumber_vertex_pair(G, vertex_pair) + vertex_pair = ensure_valid_dtype(G, vertex_pair) + src_col_name = vertex_pair.columns[0] + dst_col_name = vertex_pair.columns[1] + first = vertex_pair[src_col_name] + second = vertex_pair[dst_col_name] + elif vertex_pair is not None: raise ValueError("vertex_pair must be a cudf dataframe") - df = overlap_wrapper.overlap(input_graph, None, vertex_pair) + + first, second, overlap_coeff = pylibcugraph_overlap_coefficients( + resource_handle=ResourceHandle(), + graph=G._plc_graph, + first=first, + second=second, + use_weight=use_weight, + do_expensive_check=False, + ) + + if G.renumbered: + vertex_pair = G.unrenumber(vertex_pair, src_col_name, preserve_order=True) + vertex_pair = G.unrenumber(vertex_pair, dst_col_name, preserve_order=True) + + if v_p_num_col == 2: + # single column vertex + vertex_pair = vertex_pair.rename( + columns={src_col_name: "first", dst_col_name: "second"} + ) - if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") + df = vertex_pair + df["overlap_coeff"] = cudf.Series(overlap_coeff) return df diff --git a/python/cugraph/cugraph/link_prediction/overlap_wrapper.pyx b/python/cugraph/cugraph/link_prediction/overlap_wrapper.pyx deleted file mode 100644 index 0f61460a72f..00000000000 --- a/python/cugraph/cugraph/link_prediction/overlap_wrapper.pyx +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.link_prediction.overlap cimport overlap as c_overlap -from cugraph.link_prediction.overlap cimport overlap_list as c_overlap_list -from cugraph.structure.graph_primtypes cimport * -from cugraph.structure import graph_primtypes_wrapper -from libc.stdint cimport uintptr_t -import cudf -import numpy as np - - -def overlap(input_graph, weights_arr=None, vertex_pair=None): - """ - Call overlap or overlap_list - """ - - if not input_graph.adjlist: - input_graph.view_adj_list() - - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) - - num_verts = input_graph.number_of_vertices() - num_edges = input_graph.number_of_edges(directed_edges=True) - - first = None - second = None - - cdef uintptr_t c_result_col = NULL - cdef uintptr_t c_first_col = NULL - cdef uintptr_t c_second_col = NULL - cdef uintptr_t c_src_index_col = NULL - cdef uintptr_t c_dst_index_col = NULL - cdef uintptr_t c_weights = NULL - cdef uintptr_t c_offsets = offsets.__cuda_array_interface__['data'][0] - cdef uintptr_t c_indices = indices.__cuda_array_interface__['data'][0] - - cdef GraphCSRView[int,int,float] graph_float - cdef GraphCSRView[int,int,double] graph_double - - weight_type = np.float32 - - if weights_arr is not None: - [weights] = graph_primtypes_wrapper.datatype_cast([weights_arr], [np.float32, np.float64]) - c_weights = weights.__cuda_array_interface__['data'][0] - weight_type = weights.dtype - - if type(vertex_pair) == cudf.DataFrame: - result_size = len(vertex_pair) - result = cudf.Series(np.ones(result_size, dtype=np.float32)) - c_result_col = result.__cuda_array_interface__['data'][0] - - df = cudf.DataFrame() - df['overlap_coeff'] = result - - cols = vertex_pair.columns.to_list() - first = vertex_pair[cols[0]] - second = vertex_pair[cols[1]] - - # FIXME: multi column support - df['first'] = first - df['second'] = second - c_first_col = first.__cuda_array_interface__['data'][0] - c_second_col = second.__cuda_array_interface__['data'][0] - - if weight_type == np.float32: - graph_float = GraphCSRView[int,int,float](c_offsets, c_indices, - c_weights, num_verts, num_edges) - c_overlap_list[int,int,float](graph_float, - c_weights, - result_size, - c_first_col, - c_second_col, - c_result_col) - else: - graph_double = GraphCSRView[int,int,double](c_offsets, c_indices, - c_weights, num_verts, num_edges) - c_overlap_list[int,int,double](graph_double, - c_weights, - result_size, - c_first_col, - c_second_col, - c_result_col) - - return df - else: - # error check performed in overlap.py - assert vertex_pair is None - - df = cudf.DataFrame() - df['first'] = cudf.Series(np.zeros(num_edges, indices.dtype)) - df['second'] = indices - - c_src_index_col = df['first'].__cuda_array_interface__['data'][0] - - if weight_type == np.float32: - df['overlap_coeff'] = cudf.Series(np.ones(num_edges, dtype=np.float32), - nan_as_null=False) - c_result_col = df['overlap_coeff'].__cuda_array_interface__['data'][0] - - graph_float = GraphCSRView[int,int,float](c_offsets, - c_indices, - c_weights, - num_verts, - num_edges) - c_overlap[int,int,float](graph_float, - c_weights, - c_result_col) - - graph_float.get_source_indices(c_src_index_col) - else: - df['overlap_coeff'] = cudf.Series(np.ones(num_edges, dtype=np.float64), - nan_as_null=False) - c_result_col = df['overlap_coeff'].__cuda_array_interface__['data'][0] - - graph_double = GraphCSRView[int,int,double](c_offsets, - c_indices, - c_weights, - num_verts, - num_edges) - c_overlap[int,int,double](graph_double, - c_weights, - c_result_col) - - graph_double.get_source_indices(c_src_index_col) - - return df diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py index 1d43adb51cd..cf80841c6c0 100644 --- a/python/cugraph/cugraph/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/link_prediction/sorensen.py @@ -11,17 +11,43 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cudf -from cugraph.structure.graph_classes import Graph -from cugraph.link_prediction import jaccard_wrapper from cugraph.utilities import ( ensure_cugraph_obj_for_nx, df_edge_score_to_dictionary, renumber_vertex_pair, ) +import cudf +import warnings +from pylibcugraph import ( + sorensen_coefficients as pylibcugraph_sorensen_coefficients, +) +from pylibcugraph import ResourceHandle + + +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] + vertex_pair_dtypes = vertex_pair.dtypes + + if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: + warning_msg = ( + "Sorensen requires 'vertex_pair' to match the graph's 'vertex' type. " + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair +# FIXME: +# 1. Be consistent with column names for output/result DataFrame +# 2. Enforce that 'vertex_pair' is a cudf Dataframe with only columns for vertex pairs +# 3. We need to add support for multi-column vertices -def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): +def sorensen(G, vertex_pair=None, use_weight=False): """ Compute the Sorensen coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -30,23 +56,23 @@ def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): If first is specified but second is not, or vice versa, an exception will be thrown. - NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. - cugraph.sorensen, in the absence of a specified vertex pair list, will - use the edges of the graph to construct a vertex pair list and will - return the sorensen coefficient for those vertex pairs. + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the sorensen coefficient for those vertex pairs. This is + not advisable as the vertex_pairs can grow exponentially with respect to the + size of the datasets Parameters ---------- - input_graph : cugraph.Graph + G : cugraph.Graph cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The + as an edge list (edge weights are not supported yet for this algorithm). The graph should be undirected where an undirected edge is represented by a directed edge in both direction. The adjacency list will be computed if not already present. + This implementation only supports undirected, unweighted Graph. + vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of vertices. If provided, the Sorensen coefficient is computed for the @@ -54,9 +80,8 @@ def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): current implementation computes the Sorensen coefficient for all adjacent vertices in the graph. - do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + use_weight : bool, optional (default=False) + Currently not supported Returns ------- @@ -67,65 +92,71 @@ def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): pairs. df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified) - + The first vertex ID of each pair (will be identical to first if specified). df['second'] : cudf.Series The second vertex ID of each pair (will be identical to second if - specified) - + specified). df['sorensen_coeff'] : cudf.Series - The computed Sorensen coefficient between the source and - destination vertices + The computed sorensen coefficient between the first and the second + vertex ID. Examples -------- >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> df = cugraph.sorensen(G) + >>> from cugraph import sorensen + >>> G = karate.get_graph(download=True, ignore_weights=True) + >>> df = sorensen(G) """ - if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(input_graph) is not Graph: - raise TypeError("input graph must a Graph") - - if type(vertex_pair) == cudf.DataFrame: - vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + if G.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertex_pair is None: + # Call two_hop neighbor of the entire graph + vertex_pair = G.get_two_hop_neighbors() + + v_p_num_col = len(vertex_pair.columns) + + if isinstance(vertex_pair, cudf.DataFrame): + vertex_pair = renumber_vertex_pair(G, vertex_pair) + vertex_pair = ensure_valid_dtype(G, vertex_pair) + src_col_name = vertex_pair.columns[0] + dst_col_name = vertex_pair.columns[1] + first = vertex_pair[src_col_name] + second = vertex_pair[dst_col_name] + elif vertex_pair is not None: raise ValueError("vertex_pair must be a cudf dataframe") - df = jaccard_wrapper.jaccard(input_graph, None, vertex_pair) - df.jaccard_coeff = (2 * df.jaccard_coeff) / (1 + df.jaccard_coeff) - df.rename({"jaccard_coeff": "sorensen_coeff"}, axis=1, inplace=True) - if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") + first, second, sorensen_coeff = pylibcugraph_sorensen_coefficients( + resource_handle=ResourceHandle(), + graph=G._plc_graph, + first=first, + second=second, + use_weight=use_weight, + do_expensive_check=False, + ) + + if G.renumbered: + vertex_pair = G.unrenumber(vertex_pair, src_col_name, preserve_order=True) + vertex_pair = G.unrenumber(vertex_pair, dst_col_name, preserve_order=True) + + if v_p_num_col == 2: + # single column vertex + vertex_pair = vertex_pair.rename( + columns={src_col_name: "first", dst_col_name: "second"} + ) + + df = vertex_pair + df["sorensen_coeff"] = cudf.Series(sorensen_coeff) return df -def sorensen_coefficient(G, ebunch=None, do_expensive_check=True): +def sorensen_coefficient(G, ebunch=None, use_weight=False): """ For NetworkX Compatability. See `sorensen` - NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. - Parameters ---------- G : cugraph.Graph @@ -140,6 +171,8 @@ def sorensen_coefficient(G, ebunch=None, do_expensive_check=True): given vertex pairs. If the vertex_pair is not provided then the current implementation computes the sorensen coefficient for all adjacent vertices in the graph. + use_weight : bool, optional (default=False) + Currently not supported Returns ------- @@ -149,10 +182,10 @@ def sorensen_coefficient(G, ebunch=None, do_expensive_check=True): relative to the adjacency list, or that given by the specified vertex pairs. - df['source'] : cudf.Series - The source vertex ID (will be identical to first if specified). - df['destination'] : cudf.Series - The destination vertex ID (will be identical to second if + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + The second vertex ID of each pair (will be identical to second if specified). df['sorensen_coeff'] : cudf.Series The computed sorensen coefficient between the first and the second @@ -161,14 +194,17 @@ def sorensen_coefficient(G, ebunch=None, do_expensive_check=True): Examples -------- >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> df = cugraph.sorensen_coefficient(G) + >>> from cugraph import sorensen_coefficient + >>> G = karate.get_graph(download=True, ignore_weights=True) + >>> df = sorensen_coef(G) """ vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) + # FIXME: What is the logic behind this since the docstrings mention that 'G' and + # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? if isNx is True and ebunch is not None: vertex_pair = cudf.DataFrame(ebunch) diff --git a/python/cugraph/cugraph/link_prediction/wjaccard.py b/python/cugraph/cugraph/link_prediction/wjaccard.py index e3486473fe5..1f623cd4dd1 100644 --- a/python/cugraph/cugraph/link_prediction/wjaccard.py +++ b/python/cugraph/cugraph/link_prediction/wjaccard.py @@ -12,12 +12,38 @@ # limitations under the License. from cugraph.structure.graph_classes import Graph -from cugraph.link_prediction import jaccard_wrapper +# from cugraph.link_prediction import jaccard_wrapper +from cugraph.link_prediction import jaccard import cudf -from cugraph.utilities import renumber_vertex_pair +from pylibcugraph import ( + jaccard_coefficients as pylibcugraph_jaccard_coefficients, +) +from pylibcugraph import ResourceHandle +import warnings -def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] + vertex_pair_dtypes = vertex_pair.dtypes + + if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: + warning_msg = ( + "Jaccard requires 'vertex_pair' to match the graph's 'vertex' type. " + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair + + + + +def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=False): """ Compute the weighted Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by @@ -95,47 +121,9 @@ def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): >>> df = cugraph.jaccard_w(G, weights) """ - if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(input_graph) is not Graph: - raise TypeError("input graph must a Graph") - - if type(vertex_pair) == cudf.DataFrame: - vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - if input_graph.renumbered: - # The 'vertex' column of the cudf 'weights' also needs to be renumbered - # if the graph was renumbered - vertex_size = input_graph.vertex_column_size() - # single-column vertices i.e only one src and dst columns - if vertex_size == 1: - weights = input_graph.add_internal_vertex_id(weights, "vertex", "vertex") - # multi-column vertices i.e more than one src and dst columns - else: - cols = weights.columns[:vertex_size].to_list() - weights = input_graph.add_internal_vertex_id(weights, "vertex", cols) - - jaccard_weights = weights["weight"] - df = jaccard_wrapper.jaccard(input_graph, jaccard_weights, vertex_pair) - - if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") - - return df + warning_msg = ( + "jaccard_w is deprecated. To compute weighted jaccard, please use " + "jaccard(input_graph, vertex_pair=False, use_weight=True)" + ) + warnings.warn(warning_msg, DeprecationWarning) + return jaccard(input_graph, vertex_pair, do_expensive_check, use_weight=True) \ No newline at end of file diff --git a/python/cugraph/cugraph/link_prediction/woverlap.py b/python/cugraph/cugraph/link_prediction/woverlap.py index d7ebc5fc684..4a400c6b126 100644 --- a/python/cugraph/cugraph/link_prediction/woverlap.py +++ b/python/cugraph/cugraph/link_prediction/woverlap.py @@ -11,10 +11,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.link_prediction import overlap_wrapper +# from cugraph.link_prediction import overlap_wrapper +from cugraph.link_prediction import overlap import cudf -from cugraph.utilities import renumber_vertex_pair - +import warnings def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): """ @@ -96,43 +96,9 @@ def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): ... len(weights['vertex']))] >>> df = cugraph.overlap_w(G, weights) """ - if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(vertex_pair) == cudf.DataFrame: - vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - if input_graph.renumbered: - vertex_size = input_graph.vertex_column_size() - if vertex_size == 1: - weights = input_graph.add_internal_vertex_id(weights, "vertex", "vertex") - else: - cols = weights.columns[:vertex_size].to_list() - weights = input_graph.add_internal_vertex_id(weights, "vertex", cols) - - overlap_weights = weights["weight"] - - overlap_weights = overlap_weights.astype("float32") - - df = overlap_wrapper.overlap(input_graph, overlap_weights, vertex_pair) - - if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") - - return df + warning_msg = ( + " overlap_w is deprecated. To compute weighted overlap, please use " + "overlap(input_graph, vertex_pair=False, use_weight=True)" + ) + warnings.warn(warning_msg, DeprecationWarning) + return overlap(input_graph, vertex_pair, do_expensive_check, use_weight=True) diff --git a/python/cugraph/cugraph/link_prediction/wsorensen.py b/python/cugraph/cugraph/link_prediction/wsorensen.py index 8337b4602de..a9d3acf15b2 100644 --- a/python/cugraph/cugraph/link_prediction/wsorensen.py +++ b/python/cugraph/cugraph/link_prediction/wsorensen.py @@ -12,10 +12,9 @@ # limitations under the License. from cugraph.structure.graph_classes import Graph -from cugraph.link_prediction import jaccard_wrapper +from cugraph.link_prediction import sorensen import cudf -from cugraph.utilities import renumber_vertex_pair - +import warnings def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): """ @@ -93,44 +92,9 @@ def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): >>> df = cugraph.sorensen_w(G, weights) """ - if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(input_graph) is not Graph: - raise TypeError("input graph must a Graph") - - if type(vertex_pair) == cudf.DataFrame: - vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - if input_graph.renumbered: - vertex_size = input_graph.vertex_column_size() - if vertex_size == 1: - weights = input_graph.add_internal_vertex_id(weights, "vertex", "vertex") - else: - cols = weights.columns[:vertex_size].to_list() - weights = input_graph.add_internal_vertex_id(weights, "vertex", cols) - jaccard_weights = weights["weight"] - df = jaccard_wrapper.jaccard(input_graph, jaccard_weights, vertex_pair) - df.jaccard_coeff = (2 * df.jaccard_coeff) / (1 + df.jaccard_coeff) - df.rename({"jaccard_coeff": "sorensen_coeff"}, axis=1, inplace=True) - - if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") - - return df + warning_msg = ( + f" sorensen_w is deprecated. To compute weighted sorensen, please use " + "sorensen(input_graph, vertex_pair=False, use_weight=True)" + ) + warnings.warn(warning_msg, DeprecationWarning) + return sorensen(input_graph, vertex_pair, use_weight=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index cd883fb88f2..eac91b02311 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -11,6 +11,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +#FIXME: Can we use global variables for column names instead of hardcoded ones? + import gc import pytest @@ -20,9 +22,9 @@ import cugraph from cugraph.datasets import netscience from cugraph.testing import utils, UNDIRECTED_DATASETS -from cugraph.experimental import jaccard as exp_jaccard +from cugraph import jaccard from cudf.testing import assert_series_equal, assert_frame_equal -from cugraph.experimental import jaccard_coefficient as exp_jaccard_coefficient +from cugraph import jaccard_coefficient print("Networkx version : {} ".format(nx.__version__)) @@ -38,7 +40,9 @@ def setup_function(): # ============================================================================= # Helper functions # ============================================================================= -def compare_jaccard_two_hop(G, Gnx, edgevals=True): + + +def compare_jaccard_two_hop(G, Gnx, use_weight=False): """ Compute both cugraph and nx jaccard after extracting the two hop neighbors from G and compare both results @@ -49,29 +53,28 @@ def compare_jaccard_two_hop(G, Gnx, edgevals=True): .reset_index(drop=True) ) - nx_pairs = list(pairs.to_records(index=False)) - preds = nx.jaccard_coefficient(Gnx, nx_pairs) - nx_coeff = [] - for u, v, p in preds: - # print(u, " ", v, " ", p) - nx_coeff.append(p) df = cugraph.jaccard(G, pairs) df = df.sort_values(by=["first", "second"]).reset_index(drop=True) - if not edgevals: - # experimental jaccard currently only supports unweighted graphs - df_exp = exp_jaccard(G, pairs) - df_exp = df_exp.sort_values(by=["first", "second"]).reset_index(drop=True) - assert_frame_equal(df, df_exp, check_dtype=False, check_like=True) - assert len(nx_coeff) == len(df) - for i in range(len(df)): - diff = abs(nx_coeff[i] - df["jaccard_coeff"].iloc[i]) - assert diff < 1.0e-6 + if not use_weight: + nx_pairs = list(pairs.to_records(index=False)) + preds = nx.jaccard_coefficient(Gnx, nx_pairs) + nx_coeff = [] + for u, v, p in preds: + nx_coeff.append(p) + + assert len(nx_coeff) == len(df) + for i in range(len(df)): + diff = abs(nx_coeff[i] - df["jaccard_coeff"].iloc[i]) + assert diff < 1.0e-6 + else: + #FIXME: compare results against resultset api + pass -def cugraph_call(benchmark_callable, graph_file, edgevals=False, input_df=None): +def cugraph_call(benchmark_callable, graph_file, input_df=None, use_weight=False): G = cugraph.Graph() - G = graph_file.get_graph(ignore_weights=not edgevals) + G = graph_file.get_graph(ignore_weights=not use_weight) # If no vertex_pair is passed as input, 'cugraph.jaccard' will # compute the 'jaccard_similarity' with the two_hop_neighbor of the @@ -82,7 +85,7 @@ def cugraph_call(benchmark_callable, graph_file, edgevals=False, input_df=None): vertex_pair = vertex_pair[["first", "second"]] # cugraph Jaccard Call - df = benchmark_callable(cugraph.jaccard, G, vertex_pair=vertex_pair) + df = benchmark_callable(cugraph.jaccard, G, vertex_pair=vertex_pair, use_weight=use_weight) df = df.sort_values(["first", "second"]).reset_index(drop=True) @@ -144,27 +147,34 @@ def read_csv(request): @pytest.mark.sg -def test_jaccard(read_csv, gpubenchmark): +@pytest.mark.parametrize("use_weight", [False, True]) +def test_jaccard(read_csv, gpubenchmark, use_weight): M_cu, M, graph_file = read_csv - cu_src, cu_dst, cu_coeff = cugraph_call(gpubenchmark, graph_file, input_df=M_cu) - nx_src, nx_dst, nx_coeff = networkx_call(M) + cu_src, cu_dst, cu_coeff = cugraph_call(gpubenchmark, graph_file, input_df=M_cu, use_weight=use_weight) + if not use_weight: + nx_src, nx_dst, nx_coeff = networkx_call(M) - # Calculating mismatch - err = 0 - tol = 1.0e-06 + # Calculating mismatch + err = 0 + tol = 1.0e-06 - assert len(cu_coeff) == len(nx_coeff) - for i in range(len(cu_coeff)): - if abs(cu_coeff[i] - nx_coeff[i]) > tol * 1.1: - err += 1 + assert len(cu_coeff) == len(nx_coeff) + for i in range(len(cu_coeff)): + if abs(cu_coeff[i] - nx_coeff[i]) > tol * 1.1: + err += 1 - print("Mismatches: %d" % err) - assert err == 0 + print("Mismatches: %d" % err) + assert err == 0 + else: + #FIXME: compare weighted jaccard results against resultset api + pass + @pytest.mark.sg -def test_directed_graph_check(read_csv): +@pytest.mark.parametrize("use_weight", [False, True]) +def test_directed_graph_check(read_csv, use_weight): _, M, _ = read_csv cu_M = cudf.DataFrame() @@ -180,7 +190,7 @@ def test_directed_graph_check(read_csv): vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] vertex_pair = vertex_pair[:5] with pytest.raises(ValueError): - cugraph.jaccard(G1, vertex_pair) + cugraph.jaccard(G1, vertex_pair, use_weight) @pytest.mark.sg @@ -192,52 +202,43 @@ def test_nx_jaccard_time(read_csv, gpubenchmark): @pytest.mark.sg @pytest.mark.parametrize("graph_file", [netscience]) -@pytest.mark.skip(reason="Skipping because this datasets is unrenumbered") -def test_jaccard_edgevals(gpubenchmark, graph_file): +@pytest.mark.parametrize("use_weight", [False, True]) +def test_jaccard_edgevals(gpubenchmark, graph_file, use_weight): dataset_path = netscience.get_path() M = utils.read_csv_for_nx(dataset_path) M_cu = utils.read_csv_file(dataset_path) cu_src, cu_dst, cu_coeff = cugraph_call( - gpubenchmark, netscience, edgevals=True, input_df=M_cu + gpubenchmark, netscience, input_df=M_cu, use_weight=use_weight ) - nx_src, nx_dst, nx_coeff = networkx_call(M) + if not use_weight: + nx_src, nx_dst, nx_coeff = networkx_call(M) - # Calculating mismatch - err = 0 - tol = 1.0e-06 + # Calculating mismatch + err = 0 + tol = 1.0e-06 - assert len(cu_coeff) == len(nx_coeff) - for i in range(len(cu_coeff)): - if abs(cu_coeff[i] - nx_coeff[i]) > tol * 1.1: - err += 1 + assert len(cu_coeff) == len(nx_coeff) + for i in range(len(cu_coeff)): + if abs(cu_coeff[i] - nx_coeff[i]) > tol * 1.1: + err += 1 - print("Mismatches: %d" % err) - assert err == 0 + print("Mismatches: %d" % err) + assert err == 0 + else: + #FIXME: compare results against resultset api + pass @pytest.mark.sg -def test_jaccard_two_hop(read_csv): +@pytest.mark.parametrize("use_weight", [False, True]) +def test_jaccard_two_hop(read_csv, use_weight): _, M, graph_file = read_csv Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) - G = graph_file.get_graph(ignore_weights=True) - - compare_jaccard_two_hop(G, Gnx) - - -@pytest.mark.sg -def test_jaccard_two_hop_edge_vals(read_csv): - - _, M, graph_file = read_csv - - Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() - ) - - G = graph_file.get_graph() + G = graph_file.get_graph(ignore_weights= not use_weight) - compare_jaccard_two_hop(G, Gnx, edgevals=True) + compare_jaccard_two_hop(G, Gnx, use_weight) @pytest.mark.sg @@ -252,10 +253,8 @@ def test_jaccard_nx(read_csv): ebunch = M_cu.rename(columns={"0": "first", "1": "second"}) ebunch = ebunch[["first", "second"]] cg_j = cugraph.jaccard_coefficient(Gnx, ebunch=ebunch) - cg_j_exp = exp_jaccard_coefficient(Gnx, ebunch=ebunch) assert len(nv_js) > len(cg_j) - assert len(nv_js) > len(cg_j_exp) # FIXME: Nx does a full all-pair Jaccard. # cuGraph does a limited 1-hop Jaccard @@ -280,51 +279,18 @@ def test_jaccard_multi_column(read_csv): vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] vertex_pair = vertex_pair[:5] - df_res = cugraph.jaccard(G1, vertex_pair) - df_plc_exp = exp_jaccard(G1, vertex_pair) + df_multi_col_res = cugraph.jaccard(G1, vertex_pair) + jaccard_multi_col_res = df_multi_col_res["jaccard_coeff"].sort_values().reset_index(drop=True) - df_plc_exp = df_plc_exp.rename( - columns={ - "0_src": "0_source", - "0_dst": "0_destination", - "1_src": "1_source", - "1_dst": "1_destination", - } - ) + print(df_multi_col_res) - jaccard_res = df_res["jaccard_coeff"].sort_values().reset_index(drop=True) - jaccard_plc_exp = df_plc_exp["jaccard_coeff"].sort_values().reset_index(drop=True) - assert_series_equal(jaccard_res, jaccard_plc_exp) G2 = cugraph.Graph() G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - df_exp = cugraph.jaccard(G2, vertex_pair[["src_0", "dst_0"]]) + df_single_col_res = cugraph.jaccard(G2, vertex_pair[["src_0", "dst_0"]]) # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"]) - - -@pytest.mark.sg -def test_weighted_exp_jaccard(): - karate = UNDIRECTED_DATASETS[0] - G = karate.get_graph() - with pytest.raises(ValueError): - exp_jaccard(G) - - G = karate.get_graph(ignore_weights=True) - use_weight = True - with pytest.raises(ValueError): - exp_jaccard(G, use_weight=use_weight) + actual = df_multi_col_res.sort_values("0_src").reset_index() + expected = df_single_col_res.sort_values("first").reset_index() - -@pytest.mark.sg -def test_invalid_datasets_jaccard(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") - with pytest.raises(ValueError): - cugraph.jaccard(G) + assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"]) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py index 3da33a3e853..5d9bc1d4c98 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py @@ -20,7 +20,7 @@ import cugraph from cugraph.testing import utils, UNDIRECTED_DATASETS from cugraph.datasets import netscience -from cugraph.experimental import sorensen as exp_sorensen +from cugraph import sorensen from cudf.testing import assert_series_equal, assert_frame_equal @@ -62,7 +62,7 @@ def compare_sorensen_two_hop(G, Gnx, edgevals=False): df = df.sort_values(by=["first", "second"]).reset_index(drop=True) if not edgevals: # experimental sorensen currently only supports unweighted graphs - df_exp = exp_sorensen(G, pairs) + df_exp = sorensen(G, pairs) df_exp = df_exp.sort_values(by=["first", "second"]).reset_index(drop=True) assert_frame_equal(df, df_exp, check_dtype=False, check_like=True) assert len(nx_coeff) == len(df) @@ -244,7 +244,7 @@ def test_sorensen_multi_column(read_csv): vertex_pair = vertex_pair[:5] df_res = cugraph.sorensen(G1, vertex_pair) - df_plc_exp = exp_sorensen(G1, vertex_pair) + df_plc_exp = sorensen(G1, vertex_pair) df_plc_exp = df_plc_exp.rename( columns={ @@ -273,12 +273,12 @@ def test_weighted_exp_sorensen(): karate = UNDIRECTED_DATASETS[0] G = karate.get_graph() with pytest.raises(ValueError): - exp_sorensen(G) + sorensen(G) G = karate.get_graph(ignore_weights=True) use_weight = True with pytest.raises(ValueError): - exp_sorensen(G, use_weight=use_weight) + sorensen(G, use_weight=use_weight) @pytest.mark.sg diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index 711652bbae6..45f6de2f663 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -87,6 +87,13 @@ from pylibcugraph.generate_rmat_edgelists import generate_rmat_edgelists +from pylibcugraph.jaccard_coefficients import jaccard_coefficients + +from pylibcugraph.overlap_coefficients import overlap_coefficients + +from pylibcugraph.sorensen_coefficients import sorensen_coefficients + + from pylibcugraph import exceptions __version__ = "23.10.00" diff --git a/python/pylibcugraph/pylibcugraph/experimental/__init__.py b/python/pylibcugraph/pylibcugraph/experimental/__init__.py index 1b93f9322af..39ce2ab46a1 100644 --- a/python/pylibcugraph/pylibcugraph/experimental/__init__.py +++ b/python/pylibcugraph/pylibcugraph/experimental/__init__.py @@ -76,16 +76,16 @@ node2vec = promoted_experimental_warning_wrapper(node2vec) -from pylibcugraph.jaccard_coefficients import EXPERIMENTAL__jaccard_coefficients +# from pylibcugraph.jaccard_coefficients import EXPERIMENTAL__jaccard_coefficients -jaccard_coefficients = experimental_warning_wrapper(EXPERIMENTAL__jaccard_coefficients) +# jaccard_coefficients = experimental_warning_wrapper(EXPERIMENTAL__jaccard_coefficients) -from pylibcugraph.overlap_coefficients import EXPERIMENTAL__overlap_coefficients +# from pylibcugraph.overlap_coefficients import EXPERIMENTAL__overlap_coefficients -overlap_coefficients = experimental_warning_wrapper(EXPERIMENTAL__overlap_coefficients) +# overlap_coefficients = experimental_warning_wrapper(EXPERIMENTAL__overlap_coefficients) -from pylibcugraph.sorensen_coefficients import EXPERIMENTAL__sorensen_coefficients +# from pylibcugraph.sorensen_coefficients import EXPERIMENTAL__sorensen_coefficients -sorensen_coefficients = experimental_warning_wrapper( - EXPERIMENTAL__sorensen_coefficients -) +# sorensen_coefficients = experimental_warning_wrapper( + # EXPERIMENTAL__sorensen_coefficients +# ) diff --git a/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx b/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx index 805ee821eab..f5252e55be7 100644 --- a/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx @@ -15,6 +15,8 @@ # cython: language_level = 3 from libc.stdint cimport uintptr_t +from libc.stdio cimport printf +from cython.operator cimport dereference from pylibcugraph._cugraph_c.resource_handle cimport ( bool_t, @@ -57,7 +59,7 @@ from pylibcugraph.utils cimport ( ) -def EXPERIMENTAL__jaccard_coefficients(ResourceHandle resource_handle, +def jaccard_coefficients(ResourceHandle resource_handle, _GPUGraph graph, first, second, @@ -83,8 +85,7 @@ def EXPERIMENTAL__jaccard_coefficients(ResourceHandle resource_handle, second : Destination of the vertex pair. - use_weight : bool, optional (default=False) - Currently not supported + use_weight : bool, optional do_expensive_check : bool If True, performs more extensive tests on the inputs to ensure @@ -131,6 +132,7 @@ def EXPERIMENTAL__jaccard_coefficients(ResourceHandle resource_handle, &error_ptr) assert_success(error_code, error_ptr, "vertex_pairs") + print('cugraph_jaccard_coefficients use_weight:', use_weight) error_code = cugraph_jaccard_coefficients(c_resource_handle_ptr, c_graph_ptr, vertex_pairs_ptr, diff --git a/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx b/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx index 6af71116469..f87df7d264c 100644 --- a/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx @@ -57,7 +57,7 @@ from pylibcugraph.utils cimport ( ) -def EXPERIMENTAL__overlap_coefficients(ResourceHandle resource_handle, +def overlap_coefficients(ResourceHandle resource_handle, _GPUGraph graph, first, second, diff --git a/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx b/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx index 12647baccb2..d7fc834c8ec 100644 --- a/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx @@ -57,7 +57,7 @@ from pylibcugraph.utils cimport ( ) -def EXPERIMENTAL__sorensen_coefficients(ResourceHandle resource_handle, +def sorensen_coefficients(ResourceHandle resource_handle, _GPUGraph graph, first, second, From 052e7bc87df160140350c365a743c99d2e0a24aa Mon Sep 17 00:00:00 2001 From: Md Naim Date: Fri, 25 Aug 2023 18:51:45 -0700 Subject: [PATCH 02/16] Remove legacy jaccard and overlap .cu files --- cpp/src/link_prediction/legacy/jaccard.cu | 429 ---------------------- cpp/src/link_prediction/legacy/overlap.cu | 425 --------------------- 2 files changed, 854 deletions(-) delete mode 100644 cpp/src/link_prediction/legacy/jaccard.cu delete mode 100644 cpp/src/link_prediction/legacy/overlap.cu diff --git a/cpp/src/link_prediction/legacy/jaccard.cu b/cpp/src/link_prediction/legacy/jaccard.cu deleted file mode 100644 index d0b240e3c77..00000000000 --- a/cpp/src/link_prediction/legacy/jaccard.cu +++ /dev/null @@ -1,429 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include - -#include - -namespace cugraph { -namespace detail { - -// Volume of neighboors (*weight_s) -template -__global__ void jaccard_row_sum( - vertex_t n, edge_t const* csrPtr, vertex_t const* csrInd, weight_t const* v, weight_t* work) -{ - vertex_t row; - edge_t start, end, length; - weight_t sum; - - for (row = threadIdx.y + blockIdx.y * blockDim.y; row < n; row += gridDim.y * blockDim.y) { - start = csrPtr[row]; - end = csrPtr[row + 1]; - length = end - start; - - // compute row sums - if (weighted) { - sum = parallel_prefix_sum(length, csrInd + start, v); - if (threadIdx.x == 0) work[row] = sum; - } else { - work[row] = static_cast(length); - } - } -} - -// Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) -template -__global__ void jaccard_is(vertex_t n, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t const* v, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s) -{ - edge_t i, j, Ni, Nj; - vertex_t row, col; - vertex_t ref, cur, ref_col, cur_col, match; - weight_t ref_val; - - for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) { - for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; j < csrPtr[row + 1]; - j += gridDim.y * blockDim.y) { - col = csrInd[j]; - // find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? col : row; - - // compute new sum weights - weight_s[j] = work[row] + work[col]; - - // compute new intersection weights - // search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } else { - ref_val = 1.0; - } - - // binary search (column indices are sorted within each row) - edge_t left = csrPtr[cur]; - edge_t right = csrPtr[cur + 1] - 1; - while (left <= right) { - edge_t middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } else if (cur_col < ref_col) { - left = middle + 1; - } else { - match = middle; - break; - } - } - - // if the element with the same column index in the reference row has been found - if (match != -1) { atomicAdd(&weight_i[j], ref_val); } - } - } - } -} - -// Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) -// Using list of node pairs -template -__global__ void jaccard_is_pairs(edge_t num_pairs, - edge_t const* csrPtr, - vertex_t const* csrInd, - vertex_t const* first_pair, - vertex_t const* second_pair, - weight_t const* v, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s) -{ - edge_t i, idx, Ni, Nj, match; - vertex_t row, col, ref, cur, ref_col, cur_col; - weight_t ref_val; - - for (idx = threadIdx.z + blockIdx.z * blockDim.z; idx < num_pairs; - idx += gridDim.z * blockDim.z) { - row = first_pair[idx]; - col = second_pair[idx]; - - // find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? col : row; - - // compute new sum weights - weight_s[idx] = work[row] + work[col]; - - // compute new intersection weights - // search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } else { - ref_val = 1.0; - } - - // binary search (column indices are sorted within each row) - edge_t left = csrPtr[cur]; - edge_t right = csrPtr[cur + 1] - 1; - while (left <= right) { - edge_t middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } else if (cur_col < ref_col) { - left = middle + 1; - } else { - match = middle; - break; - } - } - - // if the element with the same column index in the reference row has been found - if (match != -1) { atomicAdd(&weight_i[idx], ref_val); } - } - } -} - -// Jaccard weights (*weight) -template -__global__ void jaccard_jw(edge_t e, - weight_t const* weight_i, - weight_t const* weight_s, - weight_t* weight_j) -{ - edge_t j; - weight_t Wi, Ws, Wu; - - for (j = threadIdx.x + blockIdx.x * blockDim.x; j < e; j += gridDim.x * blockDim.x) { - Wi = weight_i[j]; - Ws = weight_s[j]; - Wu = Ws - Wi; - weight_j[j] = (Wi / Wu); - } -} - -template -int jaccard(vertex_t n, - edge_t e, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t const* weight_in, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - rmm::cuda_stream_view stream_view; - dim3 nthreads, nblocks; - int y = 4; - - // setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, vertex_t{CUDA_MAX_BLOCKS}); - nblocks.z = 1; - - // launch kernel - jaccard_row_sum - <<>>(n, csrPtr, csrInd, weight_in, work); - - thrust::fill(rmm::exec_policy(stream_view), weight_i, weight_i + e, weight_t{0.0}); - - // setup launch configuration - nthreads.x = 32 / y; - nthreads.y = y; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, vertex_t{CUDA_MAX_BLOCKS}); // 1; - - // launch kernel - jaccard_is<<>>( - n, csrPtr, csrInd, weight_in, work, weight_i, weight_s); - - // setup launch configuration - nthreads.x = min(e, edge_t{CUDA_MAX_KERNEL_THREADS}); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, edge_t{CUDA_MAX_BLOCKS}); - nblocks.y = 1; - nblocks.z = 1; - - // launch kernel - jaccard_jw - <<>>(e, weight_i, weight_s, weight_j); - - return 0; -} - -template -int jaccard_pairs(vertex_t n, - edge_t num_pairs, - edge_t const* csrPtr, - vertex_t const* csrInd, - vertex_t const* first_pair, - vertex_t const* second_pair, - weight_t const* weight_in, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - dim3 nthreads, nblocks; - int y = 4; - - // setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, vertex_t{CUDA_MAX_BLOCKS}); - nblocks.z = 1; - - // launch kernel - jaccard_row_sum - <<>>(n, csrPtr, csrInd, weight_in, work); - cudaDeviceSynchronize(); - - // NOTE: initilized weight_i vector with 0.0 - // fill(num_pairs, weight_i, weight_t{0.0}); - - // setup launch configuration - nthreads.x = 32; - nthreads.y = 1; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, vertex_t{CUDA_MAX_BLOCKS}); // 1; - - // launch kernel - jaccard_is_pairs<<>>( - num_pairs, csrPtr, csrInd, first_pair, second_pair, weight_in, work, weight_i, weight_s); - - // setup launch configuration - nthreads.x = min(num_pairs, edge_t{CUDA_MAX_KERNEL_THREADS}); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((num_pairs + nthreads.x - 1) / nthreads.x, (edge_t)CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - - // launch kernel - jaccard_jw - <<>>(num_pairs, weight_i, weight_s, weight_j); - - return 0; -} -} // namespace detail - -template -void jaccard(legacy::GraphCSRView const& graph, WT const* weights, WT* result) -{ - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); - - rmm::device_vector weight_i(graph.number_of_edges); - rmm::device_vector weight_s(graph.number_of_edges); - rmm::device_vector work(graph.number_of_vertices); - - if (weights == nullptr) { - cugraph::detail::jaccard(graph.number_of_vertices, - graph.number_of_edges, - graph.offsets, - graph.indices, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } else { - cugraph::detail::jaccard(graph.number_of_vertices, - graph.number_of_edges, - graph.offsets, - graph.indices, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } -} - -template -void jaccard_list(legacy::GraphCSRView const& graph, - WT const* weights, - ET num_pairs, - VT const* first, - VT const* second, - WT* result) -{ - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); - CUGRAPH_EXPECTS(first != nullptr, "Invalid input argument: first is NULL"); - CUGRAPH_EXPECTS(second != nullptr, "Invalid input argument: second in NULL"); - - rmm::device_vector weight_i(num_pairs, WT{0.0}); - rmm::device_vector weight_s(num_pairs); - rmm::device_vector work(graph.number_of_vertices); - - if (weights == nullptr) { - cugraph::detail::jaccard_pairs(graph.number_of_vertices, - num_pairs, - graph.offsets, - graph.indices, - first, - second, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } else { - cugraph::detail::jaccard_pairs(graph.number_of_vertices, - num_pairs, - graph.offsets, - graph.indices, - first, - second, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } -} - -template void jaccard(legacy::GraphCSRView const&, - float const*, - float*); -template void jaccard( - legacy::GraphCSRView const&, double const*, double*); -template void jaccard(legacy::GraphCSRView const&, - float const*, - float*); -template void jaccard( - legacy::GraphCSRView const&, double const*, double*); -template void jaccard_list( - legacy::GraphCSRView const&, - float const*, - int32_t, - int32_t const*, - int32_t const*, - float*); -template void jaccard_list( - legacy::GraphCSRView const&, - double const*, - int32_t, - int32_t const*, - int32_t const*, - double*); -template void jaccard_list( - legacy::GraphCSRView const&, - float const*, - int64_t, - int64_t const*, - int64_t const*, - float*); -template void jaccard_list( - legacy::GraphCSRView const&, - double const*, - int64_t, - int64_t const*, - int64_t const*, - double*); - -} // namespace cugraph diff --git a/cpp/src/link_prediction/legacy/overlap.cu b/cpp/src/link_prediction/legacy/overlap.cu deleted file mode 100644 index 67d7cd5e4c6..00000000000 --- a/cpp/src/link_prediction/legacy/overlap.cu +++ /dev/null @@ -1,425 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -namespace cugraph { -namespace detail { - -// Volume of neighboors (*weight_s) -// TODO: Identical kernel to jaccard_row_sum!! -template -__global__ void overlap_row_sum( - vertex_t n, edge_t const* csrPtr, vertex_t const* csrInd, weight_t const* v, weight_t* work) -{ - vertex_t row; - edge_t start, end, length; - weight_t sum; - - for (row = threadIdx.y + blockIdx.y * blockDim.y; row < n; row += gridDim.y * blockDim.y) { - start = csrPtr[row]; - end = csrPtr[row + 1]; - length = end - start; - - // compute row sums - if (weighted) { - sum = parallel_prefix_sum(length, csrInd + start, v); - if (threadIdx.x == 0) work[row] = sum; - } else { - work[row] = static_cast(length); - } - } -} - -// Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) -// TODO: Identical kernel to jaccard_row_sum!! -template -__global__ void overlap_is(vertex_t n, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t const* v, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s) -{ - edge_t i, j, Ni, Nj; - vertex_t row, col; - vertex_t ref, cur, ref_col, cur_col, match; - weight_t ref_val; - - for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) { - for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; j < csrPtr[row + 1]; - j += gridDim.y * blockDim.y) { - col = csrInd[j]; - // find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? col : row; - - // compute new sum weights - weight_s[j] = min(work[row], work[col]); - - // compute new intersection weights - // search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } else { - ref_val = 1.0; - } - - // binary search (column indices are sorted within each row) - edge_t left = csrPtr[cur]; - edge_t right = csrPtr[cur + 1] - 1; - while (left <= right) { - edge_t middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } else if (cur_col < ref_col) { - left = middle + 1; - } else { - match = middle; - break; - } - } - - // if the element with the same column index in the reference row has been found - if (match != -1) { atomicAdd(&weight_i[j], ref_val); } - } - } - } -} - -// Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) -// Using list of node pairs -// NOTE: NOT the same as jaccard -template -__global__ void overlap_is_pairs(edge_t num_pairs, - edge_t const* csrPtr, - vertex_t const* csrInd, - vertex_t const* first_pair, - vertex_t const* second_pair, - weight_t const* v, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s) -{ - edge_t i, idx, Ni, Nj, match; - vertex_t row, col, ref, cur, ref_col, cur_col; - weight_t ref_val; - - for (idx = threadIdx.z + blockIdx.z * blockDim.z; idx < num_pairs; - idx += gridDim.z * blockDim.z) { - row = first_pair[idx]; - col = second_pair[idx]; - - // find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? col : row; - - // compute new sum weights - weight_s[idx] = min(work[row], work[col]); - - // compute new intersection weights - // search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } else { - ref_val = 1.0; - } - - // binary search (column indices are sorted within each row) - edge_t left = csrPtr[cur]; - edge_t right = csrPtr[cur + 1] - 1; - while (left <= right) { - edge_t middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } else if (cur_col < ref_col) { - left = middle + 1; - } else { - match = middle; - break; - } - } - - // if the element with the same column index in the reference row has been found - if (match != -1) { atomicAdd(&weight_i[idx], ref_val); } - } - } -} - -// Overlap weights (*weight) -template -__global__ void overlap_jw(edge_t e, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - edge_t j; - weight_t Wi, Wu; - - for (j = threadIdx.x + blockIdx.x * blockDim.x; j < e; j += gridDim.x * blockDim.x) { - Wi = weight_i[j]; - Wu = weight_s[j]; - weight_j[j] = (Wi / Wu); - } -} - -template -int overlap(vertex_t n, - edge_t e, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t const* weight_in, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - dim3 nthreads, nblocks; - int y = 4; - - // setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, vertex_t{CUDA_MAX_BLOCKS}); - nblocks.z = 1; - - // launch kernel - overlap_row_sum - <<>>(n, csrPtr, csrInd, weight_in, work); - cudaDeviceSynchronize(); - fill(e, weight_i, weight_t{0.0}); - - // setup launch configuration - nthreads.x = 32 / y; - nthreads.y = y; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, vertex_t{CUDA_MAX_BLOCKS}); // 1; - - // launch kernel - overlap_is - <<>>(n, csrPtr, csrInd, weight_in, work, weight_i, weight_s); - - // setup launch configuration - nthreads.x = min(e, edge_t{CUDA_MAX_KERNEL_THREADS}); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, edge_t{CUDA_MAX_BLOCKS}); - nblocks.y = 1; - nblocks.z = 1; - - // launch kernel - overlap_jw - <<>>(e, csrPtr, csrInd, weight_i, weight_s, weight_j); - - return 0; -} - -template -int overlap_pairs(vertex_t n, - edge_t num_pairs, - edge_t const* csrPtr, - vertex_t const* csrInd, - vertex_t const* first_pair, - vertex_t const* second_pair, - weight_t const* weight_in, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - dim3 nthreads, nblocks; - int y = 4; - - // setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, vertex_t{CUDA_MAX_BLOCKS}); - nblocks.z = 1; - // launch kernel - - overlap_row_sum - <<>>(n, csrPtr, csrInd, weight_in, work); - cudaDeviceSynchronize(); - fill(num_pairs, weight_i, weight_t{0.0}); - // setup launch configuration - nthreads.x = 32; - nthreads.y = 1; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, vertex_t{CUDA_MAX_BLOCKS}); // 1; - - // launch kernel - overlap_is_pairs<<>>( - num_pairs, csrPtr, csrInd, first_pair, second_pair, weight_in, work, weight_i, weight_s); - - // setup launch configuration - nthreads.x = min(num_pairs, edge_t{CUDA_MAX_KERNEL_THREADS}); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((num_pairs + nthreads.x - 1) / nthreads.x, edge_t{CUDA_MAX_BLOCKS}); - nblocks.y = 1; - nblocks.z = 1; - // launch kernel - - overlap_jw - <<>>(num_pairs, csrPtr, csrInd, weight_i, weight_s, weight_j); - - return 0; -} -} // namespace detail - -template -void overlap(legacy::GraphCSRView const& graph, WT const* weights, WT* result) -{ - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); - - rmm::device_vector weight_i(graph.number_of_edges); - rmm::device_vector weight_s(graph.number_of_edges); - rmm::device_vector work(graph.number_of_vertices); - - if (weights == nullptr) { - cugraph::detail::overlap(graph.number_of_vertices, - graph.number_of_edges, - graph.offsets, - graph.indices, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } else { - cugraph::detail::overlap(graph.number_of_vertices, - graph.number_of_edges, - graph.offsets, - graph.indices, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } -} - -template -void overlap_list(legacy::GraphCSRView const& graph, - WT const* weights, - ET num_pairs, - VT const* first, - VT const* second, - WT* result) -{ - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); - CUGRAPH_EXPECTS(first != nullptr, "Invalid input argument: first column is NULL"); - CUGRAPH_EXPECTS(second != nullptr, "Invalid input argument: second column is NULL"); - - rmm::device_vector weight_i(num_pairs); - rmm::device_vector weight_s(num_pairs); - rmm::device_vector work(graph.number_of_vertices); - - if (weights == nullptr) { - cugraph::detail::overlap_pairs(graph.number_of_vertices, - num_pairs, - graph.offsets, - graph.indices, - first, - second, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } else { - cugraph::detail::overlap_pairs(graph.number_of_vertices, - num_pairs, - graph.offsets, - graph.indices, - first, - second, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } -} - -template void overlap(legacy::GraphCSRView const&, - float const*, - float*); -template void overlap( - legacy::GraphCSRView const&, double const*, double*); -template void overlap(legacy::GraphCSRView const&, - float const*, - float*); -template void overlap( - legacy::GraphCSRView const&, double const*, double*); -template void overlap_list( - legacy::GraphCSRView const&, - float const*, - int32_t, - int32_t const*, - int32_t const*, - float*); -template void overlap_list( - legacy::GraphCSRView const&, - double const*, - int32_t, - int32_t const*, - int32_t const*, - double*); -template void overlap_list( - legacy::GraphCSRView const&, - float const*, - int64_t, - int64_t const*, - int64_t const*, - float*); -template void overlap_list( - legacy::GraphCSRView const&, - double const*, - int64_t, - int64_t const*, - int64_t const*, - double*); - -} // namespace cugraph From 2af9b909b1d935c1547c1dfde1c885f4999fbfed Mon Sep 17 00:00:00 2001 From: Md Naim Date: Mon, 28 Aug 2023 08:47:17 -0700 Subject: [PATCH 03/16] Update CMakeLists.txt and fixes copyright issues --- .pre-commit-config.yaml | 10 ++--- cpp/CMakeLists.txt | 2 - .../cugraph/cugraph/experimental/__init__.py | 24 ---------- .../cugraph/link_prediction/__init__.py | 7 ++- .../cugraph/link_prediction/jaccard.py | 23 ++++++---- .../tests/link_prediction/test_jaccard.py | 45 ++++++++++--------- .../pylibcugraph/experimental/__init__.py | 4 +- .../pylibcugraph/jaccard_coefficients.pyx | 2 +- .../pylibcugraph/overlap_coefficients.pyx | 2 +- .../pylibcugraph/sorensen_coefficients.pyx | 2 +- 10 files changed, 53 insertions(+), 68 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0f05aedf1a1..3cc848762f7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,14 +11,14 @@ repos: - id: debug-statements - id: mixed-line-ending - repo: https://github.com/psf/black - rev: 22.10.0 + rev: 23.7.0 hooks: - id: black language_version: python3 args: [--target-version=py38] files: ^python/ - repo: https://github.com/PyCQA/flake8 - rev: 6.0.0 + rev: 6.1.0 hooks: - id: flake8 args: ["--config=.flake8"] @@ -27,13 +27,13 @@ repos: types_or: [python] # TODO: Enable [python, cython] additional_dependencies: ["flake8-force"] - repo: https://github.com/asottile/yesqa - rev: v1.3.0 + rev: v1.5.0 hooks: - id: yesqa additional_dependencies: - flake8==6.0.0 - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v16.0.1 + rev: v16.0.6 hooks: - id: clang-format exclude: | @@ -52,7 +52,7 @@ repos: pass_filenames: false additional_dependencies: [gitpython] - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.5.1 + rev: v1.5.2 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 87d26bfd848..d0ee40aac3a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -196,8 +196,6 @@ set(CUGRAPH_SOURCES src/structure/legacy/graph.cu src/linear_assignment/legacy/hungarian.cu src/traversal/legacy/bfs.cu - src/link_prediction/legacy/jaccard.cu - src/link_prediction/legacy/overlap.cu src/link_prediction/jaccard_sg.cu src/link_prediction/sorensen_sg.cu src/link_prediction/overlap_sg.cu diff --git a/python/cugraph/cugraph/experimental/__init__.py b/python/cugraph/cugraph/experimental/__init__.py index 5fa0a567675..4140f5bdf02 100644 --- a/python/cugraph/cugraph/experimental/__init__.py +++ b/python/cugraph/cugraph/experimental/__init__.py @@ -48,30 +48,6 @@ experimental_warning_wrapper(EXPERIMENTAL__find_bicliques) ) -# from cugraph.experimental.link_prediction.jaccard import ( -# EXPERIMENTAL__jaccard, -# EXPERIMENTAL__jaccard_coefficient, -# ) - -# jaccard = experimental_warning_wrapper(EXPERIMENTAL__jaccard) -# jaccard_coefficient = experimental_warning_wrapper(EXPERIMENTAL__jaccard_coefficient) - -# from cugraph.experimental.link_prediction.sorensen import ( -# EXPERIMENTAL__sorensen, -# EXPERIMENTAL__sorensen_coefficient, -# ) - -# sorensen = experimental_warning_wrapper(EXPERIMENTAL__sorensen) -# sorensen_coefficient = experimental_warning_wrapper(EXPERIMENTAL__sorensen_coefficient) - -# from cugraph.experimental.link_prediction.overlap import ( -# EXPERIMENTAL__overlap, -# EXPERIMENTAL__overlap_coefficient, -# ) - -# overlap = experimental_warning_wrapper(EXPERIMENTAL__overlap) -# overlap_coefficient = experimental_warning_wrapper(EXPERIMENTAL__overlap_coefficient) - from cugraph.gnn.data_loading import EXPERIMENTAL__BulkSampler BulkSampler = experimental_warning_wrapper(EXPERIMENTAL__BulkSampler) diff --git a/python/cugraph/cugraph/link_prediction/__init__.py b/python/cugraph/cugraph/link_prediction/__init__.py index 2622c20f4c7..c90bdaacf34 100644 --- a/python/cugraph/cugraph/link_prediction/__init__.py +++ b/python/cugraph/cugraph/link_prediction/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -17,12 +17,15 @@ from cugraph.link_prediction.jaccard import jaccard_coefficient from cugraph.link_prediction.overlap import overlap from cugraph.link_prediction.wjaccard import jaccard_w + jaccard_w = deprecated_warning_wrapper(jaccard_w) from cugraph.link_prediction.woverlap import overlap_w + overlap_w = deprecated_warning_wrapper(overlap_w) from cugraph.link_prediction.wsorensen import sorensen_w + sorensen_w = deprecated_warning_wrapper(sorensen_w) from cugraph.link_prediction.jaccard import jaccard_coefficient from cugraph.link_prediction.sorensen import sorensen_coefficient from cugraph.link_prediction.sorensen import sorensen -from cugraph.link_prediction.overlap import overlap_coefficient \ No newline at end of file +from cugraph.link_prediction.overlap import overlap_coefficient diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py index 7713aab4dd9..4b628400dc6 100644 --- a/python/cugraph/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/link_prediction/jaccard.py @@ -28,7 +28,6 @@ # FIXME: Move this function to the utility module so that it can be # shared by other algos def ensure_valid_dtype(input_graph, vertex_pair): - vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] vertex_pair_dtypes = vertex_pair.dtypes @@ -44,12 +43,13 @@ def ensure_valid_dtype(input_graph, vertex_pair): return vertex_pair -# FIXME: +# FIXME: # 1. Be consistent with column names for output/result DataFrame # 2. Enforce that 'vertex_pair' is a cudf Dataframe with only columns for vertex pairs # 3. We need to add support for multi-column vertices -def jaccard(input_graph, vertex_pair=None, do_expensive_check=False, use_weight=False): + +def jaccard(input_graph, vertex_pair=None, do_expensive_check=False, use_weight=False): """ Compute the Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -142,10 +142,12 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=False, use_weight= """ - print ('use_weight =', use_weight) + print("use_weight =", use_weight) - warnings.warn("do_expensive_check is deprecated since it is no longer needed", - DeprecationWarning) + warnings.warn( + "do_expensive_check is deprecated since it is no longer needed", + DeprecationWarning, + ) if input_graph.is_directed(): raise ValueError("Input must be an undirected Graph.") @@ -177,9 +179,12 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=False, use_weight= ) if input_graph.renumbered: - vertex_pair = input_graph.unrenumber(vertex_pair, src_col_name, preserve_order=True) - vertex_pair = input_graph.unrenumber(vertex_pair, dst_col_name, preserve_order=True) - + vertex_pair = input_graph.unrenumber( + vertex_pair, src_col_name, preserve_order=True + ) + vertex_pair = input_graph.unrenumber( + vertex_pair, dst_col_name, preserve_order=True + ) if v_p_num_col == 2: # single column vertex diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index eac91b02311..05ad82ec6de 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -#FIXME: Can we use global variables for column names instead of hardcoded ones? +# FIXME: Can we use global variables for column names instead of hardcoded ones? import gc @@ -22,9 +22,9 @@ import cugraph from cugraph.datasets import netscience from cugraph.testing import utils, UNDIRECTED_DATASETS -from cugraph import jaccard -from cudf.testing import assert_series_equal, assert_frame_equal -from cugraph import jaccard_coefficient +from cudf.testing import assert_series_equal + +# from cugraph import jaccard_coefficient print("Networkx version : {} ".format(nx.__version__)) @@ -68,7 +68,7 @@ def compare_jaccard_two_hop(G, Gnx, use_weight=False): diff = abs(nx_coeff[i] - df["jaccard_coeff"].iloc[i]) assert diff < 1.0e-6 else: - #FIXME: compare results against resultset api + # FIXME: compare results against resultset api pass @@ -81,11 +81,18 @@ def cugraph_call(benchmark_callable, graph_file, input_df=None, use_weight=False # entire graph while nx compute with the one_hop_neighbor. For better # comparaison, get the one_hop_neighbor of the entire graph for 'cugraph.jaccard' # and pass it as vertex_pair - vertex_pair = input_df.rename(columns={"0": "first", "1": "second"}) - vertex_pair = vertex_pair[["first", "second"]] + if isinstance(input_df, cudf.DataFrame): + vertex_pair = input_df.rename(columns={"0": "first", "1": "second"}) + vertex_pair = vertex_pair[["first", "second"]] + else: + vertex_pair = cudf.DataFrame( + columns=["first", "second"], dtype=G.edgelist.edgelist_df["src"].dtype + ) # cugraph Jaccard Call - df = benchmark_callable(cugraph.jaccard, G, vertex_pair=vertex_pair, use_weight=use_weight) + df = benchmark_callable( + cugraph.jaccard, G, vertex_pair=vertex_pair, use_weight=use_weight + ) df = df.sort_values(["first", "second"]).reset_index(drop=True) @@ -97,7 +104,6 @@ def cugraph_call(benchmark_callable, graph_file, input_df=None, use_weight=False def networkx_call(M, benchmark_callable=None): - sources = M["0"] destinations = M["1"] edges = [] @@ -149,9 +155,10 @@ def read_csv(request): @pytest.mark.sg @pytest.mark.parametrize("use_weight", [False, True]) def test_jaccard(read_csv, gpubenchmark, use_weight): - M_cu, M, graph_file = read_csv - cu_src, cu_dst, cu_coeff = cugraph_call(gpubenchmark, graph_file, input_df=M_cu, use_weight=use_weight) + cu_src, cu_dst, cu_coeff = cugraph_call( + gpubenchmark, graph_file, input_df=M_cu, use_weight=use_weight + ) if not use_weight: nx_src, nx_dst, nx_coeff = networkx_call(M) @@ -167,9 +174,8 @@ def test_jaccard(read_csv, gpubenchmark, use_weight): print("Mismatches: %d" % err) assert err == 0 else: - #FIXME: compare weighted jaccard results against resultset api + # FIXME: compare weighted jaccard results against resultset api pass - @pytest.mark.sg @@ -195,7 +201,6 @@ def test_directed_graph_check(read_csv, use_weight): @pytest.mark.sg def test_nx_jaccard_time(read_csv, gpubenchmark): - _, M, _ = read_csv nx_src, nx_dst, nx_coeff = networkx_call(M, gpubenchmark) @@ -225,25 +230,23 @@ def test_jaccard_edgevals(gpubenchmark, graph_file, use_weight): print("Mismatches: %d" % err) assert err == 0 else: - #FIXME: compare results against resultset api + # FIXME: compare results against resultset api pass @pytest.mark.sg @pytest.mark.parametrize("use_weight", [False, True]) def test_jaccard_two_hop(read_csv, use_weight): - _, M, graph_file = read_csv Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) - G = graph_file.get_graph(ignore_weights= not use_weight) + G = graph_file.get_graph(ignore_weights=not use_weight) compare_jaccard_two_hop(G, Gnx, use_weight) @pytest.mark.sg def test_jaccard_nx(read_csv): - M_cu, M, _ = read_csv Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) @@ -263,7 +266,6 @@ def test_jaccard_nx(read_csv): @pytest.mark.sg def test_jaccard_multi_column(read_csv): - _, M, _ = read_csv cu_M = cudf.DataFrame() @@ -280,11 +282,12 @@ def test_jaccard_multi_column(read_csv): vertex_pair = vertex_pair[:5] df_multi_col_res = cugraph.jaccard(G1, vertex_pair) - jaccard_multi_col_res = df_multi_col_res["jaccard_coeff"].sort_values().reset_index(drop=True) + # jaccard_multi_col_res = ( + # df_multi_col_res["jaccard_coeff"].sort_values().reset_index(drop=True) + # ) print(df_multi_col_res) - G2 = cugraph.Graph() G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") df_single_col_res = cugraph.jaccard(G2, vertex_pair[["src_0", "dst_0"]]) diff --git a/python/pylibcugraph/pylibcugraph/experimental/__init__.py b/python/pylibcugraph/pylibcugraph/experimental/__init__.py index 39ce2ab46a1..c09899d8eda 100644 --- a/python/pylibcugraph/pylibcugraph/experimental/__init__.py +++ b/python/pylibcugraph/pylibcugraph/experimental/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -87,5 +87,5 @@ # from pylibcugraph.sorensen_coefficients import EXPERIMENTAL__sorensen_coefficients # sorensen_coefficients = experimental_warning_wrapper( - # EXPERIMENTAL__sorensen_coefficients +# EXPERIMENTAL__sorensen_coefficients # ) diff --git a/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx b/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx index f5252e55be7..7c672153cf4 100644 --- a/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx b/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx index f87df7d264c..3e75abf94f2 100644 --- a/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx b/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx index d7fc834c8ec..9dd5d5fd13e 100644 --- a/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at From 69a036fd0af01c3e0802de36e3412b4acad7f301 Mon Sep 17 00:00:00 2001 From: Md Naim Date: Mon, 28 Aug 2023 11:09:23 -0700 Subject: [PATCH 04/16] Add test for sorensen --- .../tests/link_prediction/test_jaccard.py | 6 - .../tests/link_prediction/test_sorensen.py | 166 ++++++++---------- .../tests/link_prediction/test_wjaccard.py | 133 +------------- 3 files changed, 82 insertions(+), 223 deletions(-) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index 05ad82ec6de..eeaa4fe3bb0 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -282,11 +282,6 @@ def test_jaccard_multi_column(read_csv): vertex_pair = vertex_pair[:5] df_multi_col_res = cugraph.jaccard(G1, vertex_pair) - # jaccard_multi_col_res = ( - # df_multi_col_res["jaccard_coeff"].sort_values().reset_index(drop=True) - # ) - - print(df_multi_col_res) G2 = cugraph.Graph() G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") @@ -295,5 +290,4 @@ def test_jaccard_multi_column(read_csv): # Calculating mismatch actual = df_multi_col_res.sort_values("0_src").reset_index() expected = df_single_col_res.sort_values("first").reset_index() - assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"]) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py index 5d9bc1d4c98..bcdd8d9d0a9 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py @@ -20,9 +20,14 @@ import cugraph from cugraph.testing import utils, UNDIRECTED_DATASETS from cugraph.datasets import netscience -from cugraph import sorensen -from cudf.testing import assert_series_equal, assert_frame_equal +from cudf.testing import assert_series_equal +SRC_COL = "0" +DST_COL = "1" +VERTEX_PAIR_FIRST_COL = "first" +VERTEX_PAIR_SECOND_COL = "second" +SORENSEN_COEFF_COL = "sorensen_coeff" +EDGE_ATT_COL = "weight" print("Networkx version : {} ".format(nx.__version__)) @@ -37,68 +42,85 @@ def setup_function(): # ============================================================================= # Helper functions # ============================================================================= -def compare_sorensen_two_hop(G, Gnx, edgevals=False): +def compare_sorensen_two_hop(G, Gnx, use_weight=False): """ Compute both cugraph and nx sorensen after extracting the two hop neighbors from G and compare both results """ pairs = ( G.get_two_hop_neighbors() - .sort_values(["first", "second"]) + .sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]) .reset_index(drop=True) ) - nx_pairs = [] - nx_pairs = list(pairs.to_records(index=False)) - preds = nx.jaccard_coefficient(Gnx, nx_pairs) - nx_coeff = [] - for u, v, p in preds: + + # print(f'G = {G.edgelist.edgelist_df}') + + df = cugraph.sorensen(G, pairs) + df = df.sort_values(by=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) + + if not use_weight: + nx_pairs = list(pairs.to_records(index=False)) + + # print(f'nx_pairs = {len(nx_pairs)}') + + preds = nx.jaccard_coefficient(Gnx, nx_pairs) + # FIXME: Use known correct values of Sorensen for few graphs, # hardcode it and compare to Cugraph Sorensen to get a more robust test # Conversion from Networkx Jaccard to Sorensen # No networkX equivalent - nx_coeff.append((2 * p) / (1 + p)) - df = cugraph.sorensen(G, pairs) - df = df.sort_values(by=["first", "second"]).reset_index(drop=True) - if not edgevals: - # experimental sorensen currently only supports unweighted graphs - df_exp = sorensen(G, pairs) - df_exp = df_exp.sort_values(by=["first", "second"]).reset_index(drop=True) - assert_frame_equal(df, df_exp, check_dtype=False, check_like=True) - assert len(nx_coeff) == len(df) - for i in range(len(df)): - diff = abs(nx_coeff[i] - df["sorensen_coeff"].iloc[i]) - assert diff < 1.0e-6 - - -def cugraph_call(benchmark_callable, graph_file, edgevals=False, input_df=None): + + nx_coeff = list(map(lambda x: (2 * x[2]) / (1 + x[2]), preds)) + + assert len(nx_coeff) == len(df) + for i in range(len(df)): + diff = abs(nx_coeff[i] - df[SORENSEN_COEFF_COL].iloc[i]) + assert diff < 1.0e-6 + else: + # FIXME: compare results against resultset api + pass + + +def cugraph_call(benchmark_callable, graph_file, use_weight=False, input_df=None): G = cugraph.Graph() - G = graph_file.get_graph(ignore_weights=not edgevals) + G = graph_file.get_graph(ignore_weights=not use_weight) # If no vertex_pair is passed as input, 'cugraph.sorensen' will # compute the 'sorensen_similarity' with the two_hop_neighbor of the # entire graph while nx compute with the one_hop_neighbor. For better # comparaison, get the one_hop_neighbor of the entire graph for 'cugraph.sorensen' # and pass it as vertex_pair - vertex_pair = input_df.rename(columns={"0": "first", "1": "second"}) - vertex_pair = vertex_pair[["first", "second"]] + if isinstance(input_df, cudf.DataFrame): + vertex_pair = input_df.rename( + columns={SRC_COL: VERTEX_PAIR_FIRST_COL, DST_COL: VERTEX_PAIR_SECOND_COL} + ) + vertex_pair = vertex_pair[[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]] + else: + vertex_pair = cudf.DataFrame( + columns=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL], + dtype=G.edgelist.edgelist_df["src"].dtype, + ) # cugraph Sorensen Call df = benchmark_callable(cugraph.sorensen, G, vertex_pair=vertex_pair) - df = df.sort_values(["first", "second"]).reset_index(drop=True) + df = df.sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) return ( - df["first"].to_numpy(), - df["second"].to_numpy(), - df["sorensen_coeff"].to_numpy(), + df[VERTEX_PAIR_FIRST_COL].to_numpy(), + df[VERTEX_PAIR_SECOND_COL].to_numpy(), + df[SORENSEN_COEFF_COL].to_numpy(), ) def networkx_call(M, benchmark_callable=None): - - sources = M["0"] - destinations = M["1"] + sources = M[SRC_COL] + destinations = M[DST_COL] edges = [] for i in range(len(M)): edges.append((sources[i], destinations[i])) @@ -110,7 +132,11 @@ def networkx_call(M, benchmark_callable=None): print("Format conversion ... ") Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() + M, + source=SRC_COL, + target=DST_COL, + edge_attr=EDGE_ATT_COL, + create_using=nx.Graph(), ) # Networkx Jaccard Call @@ -150,7 +176,6 @@ def read_csv(request): @pytest.mark.sg def test_sorensen(gpubenchmark, read_csv): - M_cu, M, graph_file = read_csv cu_src, cu_dst, cu_coeff = cugraph_call(gpubenchmark, graph_file, input_df=M_cu) nx_src, nx_dst, nx_coeff = networkx_call(M) @@ -170,7 +195,6 @@ def test_sorensen(gpubenchmark, read_csv): @pytest.mark.sg def test_nx_sorensen_time(gpubenchmark, read_csv): - _, M, _ = read_csv nx_src, nx_dst, nx_coeff = networkx_call(M, gpubenchmark) @@ -183,7 +207,7 @@ def test_sorensen_edgevals(gpubenchmark, graph_file): M = utils.read_csv_for_nx(dataset_path) M_cu = utils.read_csv_file(dataset_path) cu_src, cu_dst, cu_coeff = cugraph_call( - gpubenchmark, netscience, edgevals=True, input_df=M_cu + gpubenchmark, netscience, use_weight=True, input_df=M_cu ) nx_src, nx_dst, nx_coeff = networkx_call(M) @@ -202,10 +226,11 @@ def test_sorensen_edgevals(gpubenchmark, graph_file): @pytest.mark.sg def test_sorensen_two_hop(read_csv): - _, M, graph_file = read_csv - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) + Gnx = nx.from_pandas_edgelist( + M, source=SRC_COL, target=DST_COL, create_using=nx.Graph() + ) G = graph_file.get_graph(ignore_weights=True) compare_sorensen_two_hop(G, Gnx) @@ -213,26 +238,28 @@ def test_sorensen_two_hop(read_csv): @pytest.mark.sg def test_sorensen_two_hop_edge_vals(read_csv): - _, M, graph_file = read_csv Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() + M, + source=SRC_COL, + target=DST_COL, + edge_attr=EDGE_ATT_COL, + create_using=nx.Graph(), ) G = graph_file.get_graph() - compare_sorensen_two_hop(G, Gnx, edgevals=True) + compare_sorensen_two_hop(G, Gnx, use_weight=True) @pytest.mark.sg def test_sorensen_multi_column(read_csv): - _, M, _ = read_csv cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) + cu_M["src_0"] = cudf.Series(M[SRC_COL]) + cu_M["dst_0"] = cudf.Series(M[DST_COL]) cu_M["src_1"] = cu_M["src_0"] + 1000 cu_M["dst_1"] = cu_M["dst_0"] + 1000 G1 = cugraph.Graph() @@ -243,50 +270,13 @@ def test_sorensen_multi_column(read_csv): vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] vertex_pair = vertex_pair[:5] - df_res = cugraph.sorensen(G1, vertex_pair) - df_plc_exp = sorensen(G1, vertex_pair) - - df_plc_exp = df_plc_exp.rename( - columns={ - "0_src": "0_source", - "0_dst": "0_destination", - "1_src": "1_source", - "1_dst": "1_destination", - } - ) - sorensen_res = df_res["sorensen_coeff"].sort_values().reset_index(drop=True) - sorensen_plc_exp = df_plc_exp["sorensen_coeff"].sort_values().reset_index(drop=True) - assert_series_equal(sorensen_res, sorensen_plc_exp) + df_multi_col_res = cugraph.sorensen(G1, vertex_pair) G2 = cugraph.Graph() G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - df_exp = cugraph.sorensen(G2, vertex_pair[["src_0", "dst_0"]]) + df_single_col_res = cugraph.sorensen(G2, vertex_pair[["src_0", "dst_0"]]) # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["sorensen_coeff"], expected["sorensen_coeff"]) - - -@pytest.mark.sg -def test_weighted_exp_sorensen(): - karate = UNDIRECTED_DATASETS[0] - G = karate.get_graph() - with pytest.raises(ValueError): - sorensen(G) - - G = karate.get_graph(ignore_weights=True) - use_weight = True - with pytest.raises(ValueError): - sorensen(G, use_weight=use_weight) - - -@pytest.mark.sg -def test_invalid_datasets_sorensen(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") - with pytest.raises(ValueError): - cugraph.sorensen(G) + actual = df_multi_col_res.sort_values("0_src").reset_index() + expected = df_single_col_res.sort_values(VERTEX_PAIR_FIRST_COL).reset_index() + assert_series_equal(actual[SORENSEN_COEFF_COL], expected[SORENSEN_COEFF_COL]) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py index 36a21df46b8..2f1fd743bdb 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py @@ -14,13 +14,9 @@ import gc import pytest -import numpy as np import networkx as nx - -import cudf import cugraph from cugraph.testing import utils, UNDIRECTED_DATASETS -from cudf.testing import assert_series_equal print("Networkx version : {} ".format(nx.__version__)) @@ -33,55 +29,6 @@ def setup_function(): gc.collect() -def cugraph_call(benchmark_callable, graph_file): - # Device data - cu_M = graph_file.get_edgelist() - weight_arr = cudf.Series( - np.ones(max(cu_M["src"].max(), cu_M["dst"].max()) + 1, dtype=np.float32) - ) - weights = cudf.DataFrame() - weights["vertex"] = np.arange(len(weight_arr), dtype=np.int32) - weights["weight"] = weight_arr - - G = graph_file.get_graph(ignore_weights=True) - - # cugraph Jaccard Call - df = benchmark_callable(cugraph.jaccard_w, G, weights) - - df = df.sort_values(["first", "second"]).reset_index(drop=True) - - return df["jaccard_coeff"] - - -def networkx_call(M, benchmark_callable=None): - - sources = M["0"] - destinations = M["1"] - edges = [] - for i in range(len(sources)): - edges.append((sources[i], destinations[i])) - edges.append((destinations[i], sources[i])) - edges = list(dict.fromkeys(edges)) - edges = sorted(edges) - # in NVGRAPH tests we read as CSR and feed as CSC, so here we doing this - # explicitly - print("Format conversion ... ") - - # NetworkX graph - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) - # Networkx Jaccard Call - print("Solving... ") - if benchmark_callable is not None: - preds = benchmark_callable(nx.jaccard_coefficient, Gnx, edges) - else: - preds = nx.jaccard_coefficient(Gnx, edges) - - coeff = [] - for u, v, p in preds: - coeff.append(p) - return coeff - - # ============================================================================= # Pytest Fixtures # ============================================================================= @@ -98,80 +45,8 @@ def read_csv(request): @pytest.mark.sg -def test_wjaccard(gpubenchmark, read_csv): - - M, graph_file = read_csv - - cu_coeff = cugraph_call(gpubenchmark, graph_file) - nx_coeff = networkx_call(M) - for i in range(len(cu_coeff)): - diff = abs(nx_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -def test_nx_wjaccard_time(gpubenchmark, read_csv): - - M, _ = read_csv - networkx_call(M, gpubenchmark) - - -@pytest.mark.sg -def test_wjaccard_multi_column_weights(gpubenchmark, read_csv): - +def test_wjaccard(read_csv): M, graph_file = read_csv - - cu_coeff = cugraph_call(gpubenchmark, graph_file) - nx_coeff = networkx_call(M) - for i in range(len(cu_coeff)): - diff = abs(nx_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -def test_wjaccard_multi_column(read_csv): - - M, _ = read_csv - - cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 - G1 = cugraph.Graph() - G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] - ) - - G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] - vertex_pair = vertex_pair[:5] - - weight_arr = cudf.Series(np.ones(G2.number_of_vertices(), dtype=np.float32)) - weights = cudf.DataFrame() - weights["vertex"] = G2.nodes() - weights["vertex_"] = weights["vertex"] + 1000 - weights["weight"] = weight_arr - - df_res = cugraph.jaccard_w(G1, weights, vertex_pair) - - weights = weights[["vertex", "weight"]] - df_exp = cugraph.jaccard_w(G2, weights, vertex_pair[["src_0", "dst_0"]]) - - # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"]) - - -@pytest.mark.sg -def test_invalid_datasets_jaccard_w(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") - with pytest.raises(ValueError): - cugraph.jaccard_w(G, None) + G = cugraph.Graph() + G = graph_file.get_graph(ignore_weights=False) + cugraph.jaccard_w(G, weights=None) From 3b3c35b601d615953d50077398846a0ec12d260d Mon Sep 17 00:00:00 2001 From: Md Naim Date: Mon, 28 Aug 2023 11:13:04 -0700 Subject: [PATCH 05/16] Use variable for column names --- .../tests/link_prediction/test_jaccard.py | 46 ++++++++++++------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index eeaa4fe3bb0..a7227a45f0c 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -24,7 +24,12 @@ from cugraph.testing import utils, UNDIRECTED_DATASETS from cudf.testing import assert_series_equal -# from cugraph import jaccard_coefficient +SRC_COL = "0" +DST_COL = "1" +VERTEX_PAIR_FIRST_COL = "first" +VERTEX_PAIR_SECOND_COL = "second" +SORENSEN_COEFF_COL = "jaccard_coeff" +EDGE_ATT_COL = "weight" print("Networkx version : {} ".format(nx.__version__)) @@ -49,12 +54,14 @@ def compare_jaccard_two_hop(G, Gnx, use_weight=False): """ pairs = ( G.get_two_hop_neighbors() - .sort_values(["first", "second"]) + .sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]) .reset_index(drop=True) ) df = cugraph.jaccard(G, pairs) - df = df.sort_values(by=["first", "second"]).reset_index(drop=True) + df = df.sort_values(by=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) if not use_weight: nx_pairs = list(pairs.to_records(index=False)) @@ -65,7 +72,7 @@ def compare_jaccard_two_hop(G, Gnx, use_weight=False): assert len(nx_coeff) == len(df) for i in range(len(df)): - diff = abs(nx_coeff[i] - df["jaccard_coeff"].iloc[i]) + diff = abs(nx_coeff[i] - df[SORENSEN_COEFF_COL].iloc[i]) assert diff < 1.0e-6 else: # FIXME: compare results against resultset api @@ -82,11 +89,14 @@ def cugraph_call(benchmark_callable, graph_file, input_df=None, use_weight=False # comparaison, get the one_hop_neighbor of the entire graph for 'cugraph.jaccard' # and pass it as vertex_pair if isinstance(input_df, cudf.DataFrame): - vertex_pair = input_df.rename(columns={"0": "first", "1": "second"}) - vertex_pair = vertex_pair[["first", "second"]] + vertex_pair = input_df.rename( + columns={"0": VERTEX_PAIR_FIRST_COL, "1": VERTEX_PAIR_SECOND_COL} + ) + vertex_pair = vertex_pair[[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]] else: vertex_pair = cudf.DataFrame( - columns=["first", "second"], dtype=G.edgelist.edgelist_df["src"].dtype + columns=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL], + dtype=G.edgelist.edgelist_df["src"].dtype, ) # cugraph Jaccard Call @@ -94,12 +104,14 @@ def cugraph_call(benchmark_callable, graph_file, input_df=None, use_weight=False cugraph.jaccard, G, vertex_pair=vertex_pair, use_weight=use_weight ) - df = df.sort_values(["first", "second"]).reset_index(drop=True) + df = df.sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) return ( - df["first"].to_numpy(), - df["second"].to_numpy(), - df["jaccard_coeff"].to_numpy(), + df[VERTEX_PAIR_FIRST_COL].to_numpy(), + df[VERTEX_PAIR_SECOND_COL].to_numpy(), + df[SORENSEN_COEFF_COL].to_numpy(), ) @@ -117,7 +129,7 @@ def networkx_call(M, benchmark_callable=None): print("Format conversion ... ") Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() + M, source="0", target="1", edge_attr=EDGE_ATT_COL, create_using=nx.Graph() ) # Networkx Jaccard Call @@ -253,8 +265,10 @@ def test_jaccard_nx(read_csv): nx_j = nx.jaccard_coefficient(Gnx) nv_js = sorted(nx_j, key=len, reverse=True) - ebunch = M_cu.rename(columns={"0": "first", "1": "second"}) - ebunch = ebunch[["first", "second"]] + ebunch = M_cu.rename( + columns={"0": VERTEX_PAIR_FIRST_COL, "1": VERTEX_PAIR_SECOND_COL} + ) + ebunch = ebunch[[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]] cg_j = cugraph.jaccard_coefficient(Gnx, ebunch=ebunch) assert len(nv_js) > len(cg_j) @@ -289,5 +303,5 @@ def test_jaccard_multi_column(read_csv): # Calculating mismatch actual = df_multi_col_res.sort_values("0_src").reset_index() - expected = df_single_col_res.sort_values("first").reset_index() - assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"]) + expected = df_single_col_res.sort_values(VERTEX_PAIR_FIRST_COL).reset_index() + assert_series_equal(actual[SORENSEN_COEFF_COL], expected[SORENSEN_COEFF_COL]) From c11e064ff1681aee71701f72dec560bf870cca1b Mon Sep 17 00:00:00 2001 From: Md Naim Date: Mon, 28 Aug 2023 11:58:30 -0700 Subject: [PATCH 06/16] Add test coverage for sorensen_w --- .../cugraph/link_prediction/__init__.py | 13 +- .../tests/link_prediction/test_jaccard.py | 76 +++++++--- .../tests/link_prediction/test_sorensen.py | 66 ++++++--- .../tests/link_prediction/test_wsorensen.py | 136 +----------------- 4 files changed, 115 insertions(+), 176 deletions(-) diff --git a/python/cugraph/cugraph/link_prediction/__init__.py b/python/cugraph/cugraph/link_prediction/__init__.py index c90bdaacf34..a8517ee7c0f 100644 --- a/python/cugraph/cugraph/link_prediction/__init__.py +++ b/python/cugraph/cugraph/link_prediction/__init__.py @@ -15,17 +15,22 @@ from cugraph.utilities.api_tools import deprecated_warning_wrapper from cugraph.link_prediction.jaccard import jaccard from cugraph.link_prediction.jaccard import jaccard_coefficient + +from cugraph.link_prediction.sorensen import sorensen +from cugraph.link_prediction.sorensen import sorensen_coefficient + from cugraph.link_prediction.overlap import overlap +from cugraph.link_prediction.overlap import overlap_coefficient + +# To be deprecated from cugraph.link_prediction.wjaccard import jaccard_w jaccard_w = deprecated_warning_wrapper(jaccard_w) + from cugraph.link_prediction.woverlap import overlap_w overlap_w = deprecated_warning_wrapper(overlap_w) + from cugraph.link_prediction.wsorensen import sorensen_w sorensen_w = deprecated_warning_wrapper(sorensen_w) -from cugraph.link_prediction.jaccard import jaccard_coefficient -from cugraph.link_prediction.sorensen import sorensen_coefficient -from cugraph.link_prediction.sorensen import sorensen -from cugraph.link_prediction.overlap import overlap_coefficient diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index a7227a45f0c..42304454291 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -30,6 +30,10 @@ VERTEX_PAIR_SECOND_COL = "second" SORENSEN_COEFF_COL = "jaccard_coeff" EDGE_ATT_COL = "weight" +MULTI_COL_SRC_0_COL = "src_0" +MULTI_COL_DST_0_COL = "dst_0" +MULTI_COL_SRC_1_COL = "src_1" +MULTI_COL_DST_1_COL = "dst_1" print("Networkx version : {} ".format(nx.__version__)) @@ -90,7 +94,7 @@ def cugraph_call(benchmark_callable, graph_file, input_df=None, use_weight=False # and pass it as vertex_pair if isinstance(input_df, cudf.DataFrame): vertex_pair = input_df.rename( - columns={"0": VERTEX_PAIR_FIRST_COL, "1": VERTEX_PAIR_SECOND_COL} + columns={SRC_COL: VERTEX_PAIR_FIRST_COL, DST_COL: VERTEX_PAIR_SECOND_COL} ) vertex_pair = vertex_pair[[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]] else: @@ -116,8 +120,8 @@ def cugraph_call(benchmark_callable, graph_file, input_df=None, use_weight=False def networkx_call(M, benchmark_callable=None): - sources = M["0"] - destinations = M["1"] + sources = M[SRC_COL] + destinations = M[DST_COL] edges = [] for i in range(len(M)): edges.append((sources[i], destinations[i])) @@ -129,7 +133,11 @@ def networkx_call(M, benchmark_callable=None): print("Format conversion ... ") Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr=EDGE_ATT_COL, create_using=nx.Graph() + M, + source=SRC_COL, + target=DST_COL, + edge_attr=EDGE_ATT_COL, + create_using=nx.Graph(), ) # Networkx Jaccard Call @@ -196,16 +204,25 @@ def test_directed_graph_check(read_csv, use_weight): _, M, _ = read_csv cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 + cu_M[MULTI_COL_SRC_0_COL] = cudf.Series(M[SRC_COL]) + cu_M[MULTI_COL_DST_0_COL] = cudf.Series(M[DST_COL]) + cu_M[MULTI_COL_SRC_1_COL] = cu_M[MULTI_COL_SRC_0_COL] + 1000 + cu_M[MULTI_COL_DST_1_COL] = cu_M[MULTI_COL_DST_0_COL] + 1000 G1 = cugraph.Graph(directed=True) G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] + cu_M, + source=[MULTI_COL_SRC_0_COL, MULTI_COL_SRC_1_COL], + destination=[MULTI_COL_DST_0_COL, MULTI_COL_DST_1_COL], ) - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] + vertex_pair = cu_M[ + [ + MULTI_COL_SRC_0_COL, + MULTI_COL_SRC_1_COL, + MULTI_COL_DST_0_COL, + MULTI_COL_DST_1_COL, + ] + ] vertex_pair = vertex_pair[:5] with pytest.raises(ValueError): cugraph.jaccard(G1, vertex_pair, use_weight) @@ -251,7 +268,9 @@ def test_jaccard_edgevals(gpubenchmark, graph_file, use_weight): def test_jaccard_two_hop(read_csv, use_weight): _, M, graph_file = read_csv - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) + Gnx = nx.from_pandas_edgelist( + M, source=SRC_COL, target=DST_COL, create_using=nx.Graph() + ) G = graph_file.get_graph(ignore_weights=not use_weight) compare_jaccard_two_hop(G, Gnx, use_weight) @@ -260,13 +279,15 @@ def test_jaccard_two_hop(read_csv, use_weight): @pytest.mark.sg def test_jaccard_nx(read_csv): M_cu, M, _ = read_csv - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) + Gnx = nx.from_pandas_edgelist( + M, source=SRC_COL, target=DST_COL, create_using=nx.Graph() + ) nx_j = nx.jaccard_coefficient(Gnx) nv_js = sorted(nx_j, key=len, reverse=True) ebunch = M_cu.rename( - columns={"0": VERTEX_PAIR_FIRST_COL, "1": VERTEX_PAIR_SECOND_COL} + columns={SRC_COL: VERTEX_PAIR_FIRST_COL, DST_COL: VERTEX_PAIR_SECOND_COL} ) ebunch = ebunch[[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]] cg_j = cugraph.jaccard_coefficient(Gnx, ebunch=ebunch) @@ -283,23 +304,36 @@ def test_jaccard_multi_column(read_csv): _, M, _ = read_csv cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 + cu_M[MULTI_COL_SRC_0_COL] = cudf.Series(M[SRC_COL]) + cu_M[MULTI_COL_DST_0_COL] = cudf.Series(M[DST_COL]) + cu_M[MULTI_COL_SRC_1_COL] = cu_M[MULTI_COL_SRC_0_COL] + 1000 + cu_M[MULTI_COL_DST_1_COL] = cu_M[MULTI_COL_DST_0_COL] + 1000 G1 = cugraph.Graph() G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] + cu_M, + source=[MULTI_COL_SRC_0_COL, MULTI_COL_SRC_1_COL], + destination=[MULTI_COL_DST_0_COL, MULTI_COL_DST_1_COL], ) - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] + vertex_pair = cu_M[ + [ + MULTI_COL_SRC_0_COL, + MULTI_COL_SRC_1_COL, + MULTI_COL_DST_0_COL, + MULTI_COL_DST_1_COL, + ] + ] vertex_pair = vertex_pair[:5] df_multi_col_res = cugraph.jaccard(G1, vertex_pair) G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - df_single_col_res = cugraph.jaccard(G2, vertex_pair[["src_0", "dst_0"]]) + G2.from_cudf_edgelist( + cu_M, source=MULTI_COL_SRC_0_COL, destination=MULTI_COL_DST_0_COL + ) + df_single_col_res = cugraph.jaccard( + G2, vertex_pair[[MULTI_COL_SRC_0_COL, MULTI_COL_DST_0_COL]] + ) # Calculating mismatch actual = df_multi_col_res.sort_values("0_src").reset_index() diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py index bcdd8d9d0a9..3193c8b6985 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py @@ -28,6 +28,10 @@ VERTEX_PAIR_SECOND_COL = "second" SORENSEN_COEFF_COL = "sorensen_coeff" EDGE_ATT_COL = "weight" +MULTI_COL_SRC_0_COL = "src_0" +MULTI_COL_DST_0_COL = "dst_0" +MULTI_COL_SRC_1_COL = "src_1" +MULTI_COL_DST_1_COL = "dst_1" print("Networkx version : {} ".format(nx.__version__)) @@ -84,7 +88,7 @@ def compare_sorensen_two_hop(G, Gnx, use_weight=False): pass -def cugraph_call(benchmark_callable, graph_file, use_weight=False, input_df=None): +def cugraph_call(benchmark_callable, graph_file, input_df=None, use_weight=False): G = cugraph.Graph() G = graph_file.get_graph(ignore_weights=not use_weight) @@ -175,9 +179,12 @@ def read_csv(request): @pytest.mark.sg -def test_sorensen(gpubenchmark, read_csv): +@pytest.mark.parametrize("use_weight", [False, True]) +def test_sorensen(gpubenchmark, read_csv, use_weight): M_cu, M, graph_file = read_csv - cu_src, cu_dst, cu_coeff = cugraph_call(gpubenchmark, graph_file, input_df=M_cu) + cu_src, cu_dst, cu_coeff = cugraph_call( + gpubenchmark, graph_file, input_df=M_cu, use_weight=use_weight + ) nx_src, nx_dst, nx_coeff = networkx_call(M) # Calculating mismatch @@ -201,13 +208,14 @@ def test_nx_sorensen_time(gpubenchmark, read_csv): @pytest.mark.sg @pytest.mark.parametrize("graph_file", [netscience]) +@pytest.mark.parametrize("use_weight", [False, True]) @pytest.mark.skip(reason="Skipping because this datasets is unrenumbered") -def test_sorensen_edgevals(gpubenchmark, graph_file): +def test_sorensen_edgevals(gpubenchmark, graph_file, use_weight): dataset_path = netscience.get_path() M = utils.read_csv_for_nx(dataset_path) M_cu = utils.read_csv_file(dataset_path) cu_src, cu_dst, cu_coeff = cugraph_call( - gpubenchmark, netscience, use_weight=True, input_df=M_cu + gpubenchmark, netscience, input_df=M_cu, use_weight=use_weight ) nx_src, nx_dst, nx_coeff = networkx_call(M) @@ -225,19 +233,21 @@ def test_sorensen_edgevals(gpubenchmark, graph_file): @pytest.mark.sg -def test_sorensen_two_hop(read_csv): +@pytest.mark.parametrize("use_weight", [False, True]) +def test_sorensen_two_hop(read_csv, use_weight): _, M, graph_file = read_csv Gnx = nx.from_pandas_edgelist( M, source=SRC_COL, target=DST_COL, create_using=nx.Graph() ) - G = graph_file.get_graph(ignore_weights=True) + G = graph_file.get_graph(ignore_weights=not use_weight) - compare_sorensen_two_hop(G, Gnx) + compare_sorensen_two_hop(G, Gnx, use_weight=use_weight) @pytest.mark.sg -def test_sorensen_two_hop_edge_vals(read_csv): +@pytest.mark.parametrize("use_weight", [False, True]) +def test_sorensen_two_hop_edge_vals(read_csv, use_weight): _, M, graph_file = read_csv Gnx = nx.from_pandas_edgelist( @@ -248,33 +258,51 @@ def test_sorensen_two_hop_edge_vals(read_csv): create_using=nx.Graph(), ) - G = graph_file.get_graph() + G = graph_file.get_graph(ignore_weights=not use_weight) - compare_sorensen_two_hop(G, Gnx, use_weight=True) + compare_sorensen_two_hop(G, Gnx, use_weight=use_weight) @pytest.mark.sg def test_sorensen_multi_column(read_csv): _, M, _ = read_csv + MULTI_COL_SRC_0_COL = "src_0" + MULTI_COL_DST_0_COL = "dst_0" + MULTI_COL_SRC_1_COL = "src_1" + MULTI_COL_DST_1_COL = "dst_1" + cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M[SRC_COL]) - cu_M["dst_0"] = cudf.Series(M[DST_COL]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 + cu_M[MULTI_COL_SRC_0_COL] = cudf.Series(M[SRC_COL]) + cu_M[MULTI_COL_DST_0_COL] = cudf.Series(M[DST_COL]) + cu_M[MULTI_COL_SRC_1_COL] = cu_M[MULTI_COL_SRC_0_COL] + 1000 + cu_M[MULTI_COL_DST_1_COL] = cu_M[MULTI_COL_DST_0_COL] + 1000 G1 = cugraph.Graph() G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] + cu_M, + source=[MULTI_COL_SRC_0_COL, MULTI_COL_SRC_1_COL], + destination=[MULTI_COL_DST_0_COL, MULTI_COL_DST_1_COL], ) - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] + vertex_pair = cu_M[ + [ + MULTI_COL_SRC_0_COL, + MULTI_COL_SRC_1_COL, + MULTI_COL_DST_0_COL, + MULTI_COL_DST_1_COL, + ] + ] vertex_pair = vertex_pair[:5] df_multi_col_res = cugraph.sorensen(G1, vertex_pair) G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - df_single_col_res = cugraph.sorensen(G2, vertex_pair[["src_0", "dst_0"]]) + G2.from_cudf_edgelist( + cu_M, source=MULTI_COL_SRC_0_COL, destination=MULTI_COL_DST_0_COL + ) + df_single_col_res = cugraph.sorensen( + G2, vertex_pair[[MULTI_COL_SRC_0_COL, MULTI_COL_DST_0_COL]] + ) # Calculating mismatch actual = df_multi_col_res.sort_values("0_src").reset_index() diff --git a/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py index 8d09b3e25b3..e95abfbf517 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py @@ -14,12 +14,9 @@ import gc import pytest -import numpy as np import networkx as nx -import cudf import cugraph -from cudf.testing import assert_series_equal from cugraph.testing import utils, UNDIRECTED_DATASETS @@ -33,59 +30,6 @@ def setup_function(): gc.collect() -def cugraph_call(benchmark_callable, graph_file): - # Device data - cu_M = graph_file.get_edgelist() - weight_arr = cudf.Series( - np.ones(max(cu_M["src"].max(), cu_M["dst"].max()) + 1, dtype=np.float32) - ) - weights = cudf.DataFrame() - weights["vertex"] = np.arange(len(weight_arr), dtype=np.int32) - weights["weight"] = weight_arr - - G = graph_file.get_graph(ignore_weights=True) - - # cugraph Sorensen Call - df = benchmark_callable(cugraph.sorensen_w, G, weights) - - df = df.sort_values(["first", "second"]).reset_index(drop=True) - - return df["sorensen_coeff"] - - -def networkx_call(M, benchmark_callable=None): - - sources = M["0"] - destinations = M["1"] - edges = [] - for i in range(len(sources)): - edges.append((sources[i], destinations[i])) - edges.append((destinations[i], sources[i])) - edges = list(dict.fromkeys(edges)) - edges = sorted(edges) - # in NVGRAPH tests we read as CSR and feed as CSC, so here we doing this - # explicitly - print("Format conversion ... ") - - # NetworkX graph - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) - # Networkx Jaccard Call - print("Solving... ") - if benchmark_callable is not None: - preds = benchmark_callable(nx.jaccard_coefficient, Gnx, edges) - else: - preds = nx.jaccard_coefficient(Gnx, edges) - coeff = [] - for u, v, p in preds: - # FIXME: Use known correct values of WSorensen for few graphs, - # hardcode it and compare to Cugraph WSorensen - # to get a more robust test - - # Conversion from Networkx Jaccard to Sorensen - coeff.append((2 * p) / (1 + p)) - return coeff - - # ============================================================================= # Pytest Fixtures # ============================================================================= @@ -102,80 +46,8 @@ def read_csv(request): @pytest.mark.sg -def test_wsorensen(gpubenchmark, read_csv): - +def test_wsorensen(read_csv): M, graph_file = read_csv - - cu_coeff = cugraph_call(gpubenchmark, graph_file) - nx_coeff = networkx_call(M) - for i in range(len(cu_coeff)): - diff = abs(nx_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -def test_nx_wsorensen_time(gpubenchmark, read_csv): - - M, _ = read_csv - networkx_call(M, gpubenchmark) - - -@pytest.mark.sg -def test_wsorensen_multi_column_weights(gpubenchmark, read_csv): - - M, cu_M = read_csv - - cu_coeff = cugraph_call(gpubenchmark, cu_M) - nx_coeff = networkx_call(M) - for i in range(len(cu_coeff)): - diff = abs(nx_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -def test_wsorensen_multi_column(read_csv): - - M, _ = read_csv - - cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 - G1 = cugraph.Graph() - G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] - ) - - G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] - vertex_pair = vertex_pair[:5] - - weight_arr = cudf.Series(np.ones(G2.number_of_vertices(), dtype=np.float32)) - weights = cudf.DataFrame() - weights["vertex"] = G2.nodes() - weights["vertex_"] = weights["vertex"] + 1000 - weights["weight"] = weight_arr - - df_res = cugraph.sorensen_w(G1, weights, vertex_pair) - - weights = weights[["vertex", "weight"]] - df_exp = cugraph.sorensen_w(G2, weights, vertex_pair[["src_0", "dst_0"]]) - - # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["sorensen_coeff"], expected["sorensen_coeff"]) - - -@pytest.mark.sg -def test_invalid_datasets_sorensen_w(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") - with pytest.raises(ValueError): - cugraph.sorensen_w(G, None) + G = cugraph.Graph() + G = graph_file.get_graph(ignore_weights=False) + cugraph.sorensen_w(G, weights=None) From cbb414a10a88cdce1fa53bbff61a143815eefa38 Mon Sep 17 00:00:00 2001 From: Md Naim Date: Mon, 28 Aug 2023 14:32:42 -0700 Subject: [PATCH 07/16] Update tests for sorensen and overlap, add more test coverage --- .../tests/link_prediction/test_jaccard.py | 60 ++++--- .../tests/link_prediction/test_overlap.py | 148 +++++++++-------- .../tests/link_prediction/test_sorensen.py | 53 +++++- .../tests/link_prediction/test_woverlap.py | 157 +++--------------- 4 files changed, 179 insertions(+), 239 deletions(-) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index 42304454291..f2e1486b2f3 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -28,7 +28,7 @@ DST_COL = "1" VERTEX_PAIR_FIRST_COL = "first" VERTEX_PAIR_SECOND_COL = "second" -SORENSEN_COEFF_COL = "jaccard_coeff" +JACCARD_COEFF_COL = "jaccard_coeff" EDGE_ATT_COL = "weight" MULTI_COL_SRC_0_COL = "src_0" MULTI_COL_DST_0_COL = "dst_0" @@ -76,7 +76,7 @@ def compare_jaccard_two_hop(G, Gnx, use_weight=False): assert len(nx_coeff) == len(df) for i in range(len(df)): - diff = abs(nx_coeff[i] - df[SORENSEN_COEFF_COL].iloc[i]) + diff = abs(nx_coeff[i] - df[JACCARD_COEFF_COL].iloc[i]) assert diff < 1.0e-6 else: # FIXME: compare results against resultset api @@ -115,7 +115,7 @@ def cugraph_call(benchmark_callable, graph_file, input_df=None, use_weight=False return ( df[VERTEX_PAIR_FIRST_COL].to_numpy(), df[VERTEX_PAIR_SECOND_COL].to_numpy(), - df[SORENSEN_COEFF_COL].to_numpy(), + df[JACCARD_COEFF_COL].to_numpy(), ) @@ -204,25 +204,17 @@ def test_directed_graph_check(read_csv, use_weight): _, M, _ = read_csv cu_M = cudf.DataFrame() - cu_M[MULTI_COL_SRC_0_COL] = cudf.Series(M[SRC_COL]) - cu_M[MULTI_COL_DST_0_COL] = cudf.Series(M[DST_COL]) - cu_M[MULTI_COL_SRC_1_COL] = cu_M[MULTI_COL_SRC_0_COL] + 1000 - cu_M[MULTI_COL_DST_1_COL] = cu_M[MULTI_COL_DST_0_COL] + 1000 + cu_M[SRC_COL] = cudf.Series(M[SRC_COL]) + cu_M[DST_COL] = cudf.Series(M[DST_COL]) + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + G1 = cugraph.Graph(directed=True) - G1.from_cudf_edgelist( - cu_M, - source=[MULTI_COL_SRC_0_COL, MULTI_COL_SRC_1_COL], - destination=[MULTI_COL_DST_0_COL, MULTI_COL_DST_1_COL], - ) + weight = EDGE_ATT_COL if use_weight else None + G1.from_cudf_edgelist(cu_M, source=SRC_COL, destination=DST_COL, weight=weight) + + vertex_pair = cu_M[[SRC_COL, DST_COL]] - vertex_pair = cu_M[ - [ - MULTI_COL_SRC_0_COL, - MULTI_COL_SRC_1_COL, - MULTI_COL_DST_0_COL, - MULTI_COL_DST_1_COL, - ] - ] vertex_pair = vertex_pair[:5] with pytest.raises(ValueError): cugraph.jaccard(G1, vertex_pair, use_weight) @@ -300,19 +292,27 @@ def test_jaccard_nx(read_csv): @pytest.mark.sg -def test_jaccard_multi_column(read_csv): - _, M, _ = read_csv +@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) +@pytest.mark.parametrize("use_weight", [False, True]) +def test_jaccard_multi_column(graph_file, use_weight): + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) cu_M = cudf.DataFrame() cu_M[MULTI_COL_SRC_0_COL] = cudf.Series(M[SRC_COL]) cu_M[MULTI_COL_DST_0_COL] = cudf.Series(M[DST_COL]) cu_M[MULTI_COL_SRC_1_COL] = cu_M[MULTI_COL_SRC_0_COL] + 1000 cu_M[MULTI_COL_DST_1_COL] = cu_M[MULTI_COL_DST_0_COL] + 1000 + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + G1 = cugraph.Graph() + weight = EDGE_ATT_COL if use_weight else None G1.from_cudf_edgelist( cu_M, source=[MULTI_COL_SRC_0_COL, MULTI_COL_SRC_1_COL], destination=[MULTI_COL_DST_0_COL, MULTI_COL_DST_1_COL], + weight=weight, ) vertex_pair = cu_M[ @@ -329,7 +329,7 @@ def test_jaccard_multi_column(read_csv): G2 = cugraph.Graph() G2.from_cudf_edgelist( - cu_M, source=MULTI_COL_SRC_0_COL, destination=MULTI_COL_DST_0_COL + cu_M, source=MULTI_COL_SRC_0_COL, destination=MULTI_COL_DST_0_COL, weight=weight ) df_single_col_res = cugraph.jaccard( G2, vertex_pair[[MULTI_COL_SRC_0_COL, MULTI_COL_DST_0_COL]] @@ -338,4 +338,16 @@ def test_jaccard_multi_column(read_csv): # Calculating mismatch actual = df_multi_col_res.sort_values("0_src").reset_index() expected = df_single_col_res.sort_values(VERTEX_PAIR_FIRST_COL).reset_index() - assert_series_equal(actual[SORENSEN_COEFF_COL], expected[SORENSEN_COEFF_COL]) + assert_series_equal(actual[JACCARD_COEFF_COL], expected[JACCARD_COEFF_COL]) + + +@pytest.mark.sg +def test_weighted_jaccard(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + with pytest.raises(ValueError): + cugraph.jaccard(G, use_weight=True) + + G = karate.get_graph(ignore_weights=True) + with pytest.raises(ValueError): + cugraph.jaccard(G, use_weight=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py index 586d534cd42..89951586e1f 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py @@ -20,8 +20,18 @@ import cudf import cugraph from cugraph.testing import utils, UNDIRECTED_DATASETS -from cugraph.experimental import overlap as exp_overlap -from cudf.testing import assert_series_equal, assert_frame_equal +from cudf.testing import assert_series_equal + +SRC_COL = "0" +DST_COL = "1" +VERTEX_PAIR_FIRST_COL = "first" +VERTEX_PAIR_SECOND_COL = "second" +OVERLAP_COEFF_COL = "overlap_coeff" +EDGE_ATT_COL = "weight" +MULTI_COL_SRC_0_COL = "src_0" +MULTI_COL_DST_0_COL = "dst_0" +MULTI_COL_SRC_1_COL = "src_1" +MULTI_COL_DST_1_COL = "dst_1" # ============================================================================= @@ -35,7 +45,6 @@ def setup_function(): # Helper functions # ============================================================================= def compare_overlap(cu_coeff, cpu_coeff): - assert len(cu_coeff) == len(cpu_coeff) for i in range(len(cu_coeff)): if np.isnan(cpu_coeff[i]): @@ -47,21 +56,18 @@ def compare_overlap(cu_coeff, cpu_coeff): assert diff < 1.0e-6 -def cugraph_call(benchmark_callable, graph_file, pairs, edgevals=False): +def cugraph_call(benchmark_callable, graph_file, pairs, use_weight=False): # Device data G = graph_file.get_graph( - create_using=cugraph.Graph(directed=False), ignore_weights=not edgevals + create_using=cugraph.Graph(directed=False), ignore_weights=not use_weight ) # cugraph Overlap Call df = benchmark_callable(cugraph.overlap, G, pairs) - df = df.sort_values(by=["first", "second"]).reset_index(drop=True) - if not edgevals: - # experimental overlap currently only supports unweighted graphs - df_exp = exp_overlap(G, pairs) - df_exp = df_exp.sort_values(by=["first", "second"]).reset_index(drop=True) - assert_frame_equal(df, df_exp, check_dtype=False, check_like=True) + df = df.sort_values(by=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) - return df["overlap_coeff"].to_numpy() + return df[OVERLAP_COEFF_COL].to_numpy() def intersection(a, b, M): @@ -120,8 +126,10 @@ def read_csv(request): dataset_path = graph_file.get_path() Mnx = utils.read_csv_for_nx(dataset_path) - N = max(max(Mnx["0"]), max(Mnx["1"])) + 1 - M = scipy.sparse.csr_matrix((Mnx.weight, (Mnx["0"], Mnx["1"])), shape=(N, N)) + N = max(max(Mnx[SRC_COL]), max(Mnx[DST_COL])) + 1 + M = scipy.sparse.csr_matrix( + (Mnx.weight, (Mnx[SRC_COL], Mnx[DST_COL])), shape=(N, N) + ) return M, graph_file @@ -135,7 +143,7 @@ def extract_two_hop(read_csv): G = graph_file.get_graph(ignore_weights=True) pairs = ( G.get_two_hop_neighbors() - .sort_values(["first", "second"]) + .sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]) .reset_index(drop=True) ) @@ -144,93 +152,95 @@ def extract_two_hop(read_csv): # Test @pytest.mark.sg -def test_overlap(gpubenchmark, read_csv, extract_two_hop): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_overlap(gpubenchmark, read_csv, extract_two_hop, use_weight): M, graph_file = read_csv pairs = extract_two_hop - cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs) - cpu_coeff = cpu_call(M, pairs["first"], pairs["second"]) + cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs, use_weight=use_weight) + cpu_coeff = cpu_call(M, pairs[VERTEX_PAIR_FIRST_COL], pairs[VERTEX_PAIR_SECOND_COL]) compare_overlap(cu_coeff, cpu_coeff) -# Test @pytest.mark.sg -def test_overlap_edge_vals(gpubenchmark, read_csv, extract_two_hop): +@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) +@pytest.mark.parametrize("use_weight", [False, True]) +def test_directed_graph_check(graph_file, use_weight): + M = utils.read_csv_for_nx(graph_file.get_path()) + cu_M = cudf.DataFrame() + cu_M[SRC_COL] = cudf.Series(M[SRC_COL]) + cu_M[DST_COL] = cudf.Series(M[DST_COL]) + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) - M, graph_file = read_csv - pairs = extract_two_hop + G1 = cugraph.Graph(directed=True) + weight = EDGE_ATT_COL if use_weight else None + G1.from_cudf_edgelist(cu_M, source=SRC_COL, destination=DST_COL, weight=weight) - cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs, edgevals=True) - cpu_coeff = cpu_call(M, pairs["first"], pairs["second"]) + vertex_pair = cu_M[[SRC_COL, DST_COL]] - compare_overlap(cu_coeff, cpu_coeff) + vertex_pair = vertex_pair[:5] + with pytest.raises(ValueError): + cugraph.overlap(G1, vertex_pair, use_weight) @pytest.mark.sg @pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) -def test_overlap_multi_column(graph_file): +@pytest.mark.parametrize("use_weight", [False, True]) +def test_overlap_multi_column(graph_file, use_weight): dataset_path = graph_file.get_path() M = utils.read_csv_for_nx(dataset_path) cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 + cu_M[MULTI_COL_SRC_0_COL] = cudf.Series(M[SRC_COL]) + cu_M[MULTI_COL_DST_0_COL] = cudf.Series(M[DST_COL]) + cu_M[MULTI_COL_SRC_1_COL] = cu_M[MULTI_COL_SRC_0_COL] + 1000 + cu_M[MULTI_COL_DST_1_COL] = cu_M[MULTI_COL_DST_0_COL] + 1000 + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + G1 = cugraph.Graph() + weight = EDGE_ATT_COL if use_weight else None G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] + cu_M, + source=[MULTI_COL_SRC_0_COL, MULTI_COL_SRC_1_COL], + destination=[MULTI_COL_DST_0_COL, MULTI_COL_DST_1_COL], + weight=weight, ) - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] + vertex_pair = cu_M[ + [ + MULTI_COL_SRC_0_COL, + MULTI_COL_SRC_1_COL, + MULTI_COL_DST_0_COL, + MULTI_COL_DST_1_COL, + ] + ] vertex_pair = vertex_pair[:5] - df_res = cugraph.overlap(G1, vertex_pair) - df_plc_exp = exp_overlap(G1, vertex_pair) - - df_plc_exp = df_plc_exp.rename( - columns={ - "0_src": "0_source", - "0_dst": "0_destination", - "1_src": "1_source", - "1_dst": "1_destination", - } - ) - overlap_res = df_res["overlap_coeff"].sort_values().reset_index(drop=True) - overlap_plc_exp = df_plc_exp["overlap_coeff"].sort_values().reset_index(drop=True) - assert_series_equal(overlap_res, overlap_plc_exp) - + df_multi_col_res = cugraph.overlap(G1, vertex_pair, use_weight=use_weight) G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - df_exp = cugraph.overlap(G2, vertex_pair[["src_0", "dst_0"]]) + G2.from_cudf_edgelist( + cu_M, source=MULTI_COL_SRC_0_COL, destination=MULTI_COL_DST_0_COL, weight=weight + ) + df_single_col_res = cugraph.overlap( + G2, vertex_pair[[MULTI_COL_SRC_0_COL, MULTI_COL_DST_0_COL]] + ) # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["overlap_coeff"], expected["overlap_coeff"]) + actual = df_multi_col_res.sort_values("0_src").reset_index() + expected = df_single_col_res.sort_values(VERTEX_PAIR_FIRST_COL).reset_index() + assert_series_equal(actual[OVERLAP_COEFF_COL], expected[OVERLAP_COEFF_COL]) @pytest.mark.sg -def test_weighted_exp_overlap(): +def test_weighted_overlap(): karate = UNDIRECTED_DATASETS[0] - G = karate.get_graph() - with pytest.raises(ValueError): - exp_overlap(G) - G = karate.get_graph(ignore_weights=True) - use_weight = True with pytest.raises(ValueError): - exp_overlap(G, use_weight=use_weight) - + cugraph.overlap(G, use_weight=True) -@pytest.mark.sg -def test_invalid_datasets_overlap(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") + G = karate.get_graph(ignore_weights=True) with pytest.raises(ValueError): - cugraph.overlap(G) + cugraph.overlap(G, use_weight=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py index 3193c8b6985..0175201a086 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py @@ -206,6 +206,28 @@ def test_nx_sorensen_time(gpubenchmark, read_csv): nx_src, nx_dst, nx_coeff = networkx_call(M, gpubenchmark) +@pytest.mark.sg +@pytest.mark.parametrize("use_weight", [False, True]) +def test_directed_graph_check(read_csv, use_weight): + _, M, _ = read_csv + + cu_M = cudf.DataFrame() + cu_M[SRC_COL] = cudf.Series(M[SRC_COL]) + cu_M[DST_COL] = cudf.Series(M[DST_COL]) + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + + G1 = cugraph.Graph(directed=True) + weight = EDGE_ATT_COL if use_weight else None + G1.from_cudf_edgelist(cu_M, source=SRC_COL, destination=DST_COL, weight=weight) + + vertex_pair = cu_M[[SRC_COL, DST_COL]] + + vertex_pair = vertex_pair[:5] + with pytest.raises(ValueError): + cugraph.sorensen(G1, vertex_pair, use_weight) + + @pytest.mark.sg @pytest.mark.parametrize("graph_file", [netscience]) @pytest.mark.parametrize("use_weight", [False, True]) @@ -264,24 +286,27 @@ def test_sorensen_two_hop_edge_vals(read_csv, use_weight): @pytest.mark.sg -def test_sorensen_multi_column(read_csv): - _, M, _ = read_csv - - MULTI_COL_SRC_0_COL = "src_0" - MULTI_COL_DST_0_COL = "dst_0" - MULTI_COL_SRC_1_COL = "src_1" - MULTI_COL_DST_1_COL = "dst_1" +@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) +@pytest.mark.parametrize("use_weight", [False, True]) +def test_sorensen_multi_column(graph_file, use_weight): + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) cu_M = cudf.DataFrame() cu_M[MULTI_COL_SRC_0_COL] = cudf.Series(M[SRC_COL]) cu_M[MULTI_COL_DST_0_COL] = cudf.Series(M[DST_COL]) cu_M[MULTI_COL_SRC_1_COL] = cu_M[MULTI_COL_SRC_0_COL] + 1000 cu_M[MULTI_COL_DST_1_COL] = cu_M[MULTI_COL_DST_0_COL] + 1000 + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + G1 = cugraph.Graph() + weight = EDGE_ATT_COL if use_weight else None G1.from_cudf_edgelist( cu_M, source=[MULTI_COL_SRC_0_COL, MULTI_COL_SRC_1_COL], destination=[MULTI_COL_DST_0_COL, MULTI_COL_DST_1_COL], + weight=weight, ) vertex_pair = cu_M[ @@ -298,7 +323,7 @@ def test_sorensen_multi_column(read_csv): G2 = cugraph.Graph() G2.from_cudf_edgelist( - cu_M, source=MULTI_COL_SRC_0_COL, destination=MULTI_COL_DST_0_COL + cu_M, source=MULTI_COL_SRC_0_COL, destination=MULTI_COL_DST_0_COL, weight=weight ) df_single_col_res = cugraph.sorensen( G2, vertex_pair[[MULTI_COL_SRC_0_COL, MULTI_COL_DST_0_COL]] @@ -308,3 +333,15 @@ def test_sorensen_multi_column(read_csv): actual = df_multi_col_res.sort_values("0_src").reset_index() expected = df_single_col_res.sort_values(VERTEX_PAIR_FIRST_COL).reset_index() assert_series_equal(actual[SORENSEN_COEFF_COL], expected[SORENSEN_COEFF_COL]) + + +@pytest.mark.sg +def test_weighted_sorensen(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + with pytest.raises(ValueError): + cugraph.sorensen(G, use_weight=True) + + G = karate.get_graph(ignore_weights=True) + with pytest.raises(ValueError): + cugraph.sorensen(G, use_weight=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py b/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py index 1dffb9fca41..247ea9ed905 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py @@ -14,15 +14,14 @@ import gc import pytest -import scipy -import numpy as np - -import cudf +import networkx as nx import cugraph -from cudf.testing import assert_series_equal from cugraph.testing import utils, UNDIRECTED_DATASETS +print("Networkx version : {} ".format(nx.__version__)) + + # ============================================================================= # Pytest Setup / Teardown - called for each test function # ============================================================================= @@ -30,142 +29,24 @@ def setup_function(): gc.collect() -def cugraph_call(benchmark_callable, graph_file, pairs): - # Device data - cu_M = graph_file.get_edgelist() - weights_arr = cudf.Series( - np.ones(max(cu_M["src"].max(), cu_M["dst"].max()) + 1, dtype=np.float32) - ) - weights = cudf.DataFrame() - weights["vertex"] = np.arange(len(weights_arr), dtype=np.int32) - weights["weight"] = weights_arr - - G = graph_file.get_graph(create_using=cugraph.Graph(directed=True)) - - # cugraph Overlap Call - df = benchmark_callable(cugraph.overlap_w, G, weights, pairs) - - df = df.sort_values(by=["first", "second"]) - return df["overlap_coeff"].to_numpy() - - -def intersection(a, b, M): - count = 0 - a_idx = M.indptr[a] - b_idx = M.indptr[b] - - while (a_idx < M.indptr[a + 1]) and (b_idx < M.indptr[b + 1]): - a_vertex = M.indices[a_idx] - b_vertex = M.indices[b_idx] - - if a_vertex == b_vertex: - count += 1 - a_idx += 1 - b_idx += 1 - elif a_vertex < b_vertex: - a_idx += 1 - else: - b_idx += 1 - - return count - - -def degree(a, M): - return M.indptr[a + 1] - M.indptr[a] - - -def overlap(a, b, M): - b_sum = degree(b, M) - if b_sum == 0: - return float("NaN") - - i = intersection(a, b, M) - a_sum = degree(a, M) - total = min(a_sum, b_sum) - return i / total - - -def cpu_call(M, first, second): - result = [] - for i in range(len(first)): - result.append(overlap(first[i], second[i], M)) - return result - - -@pytest.mark.sg -@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) -def test_woverlap(gpubenchmark, graph_file): - dataset_path = graph_file.get_path() - Mnx = utils.read_csv_for_nx(dataset_path) - N = max(max(Mnx["0"]), max(Mnx["1"])) + 1 - M = scipy.sparse.csr_matrix((Mnx.weight, (Mnx["0"], Mnx["1"])), shape=(N, N)) - - G = graph_file.get_graph(ignore_weights=True) - pairs = ( - G.get_two_hop_neighbors() - .sort_values(["first", "second"]) - .reset_index(drop=True) - ) - - cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs) - cpu_coeff = cpu_call(M, pairs["first"], pairs["second"]) - assert len(cu_coeff) == len(cpu_coeff) - for i in range(len(cu_coeff)): - if np.isnan(cpu_coeff[i]): - assert np.isnan(cu_coeff[i]) - elif np.isnan(cu_coeff[i]): - assert cpu_coeff[i] == cu_coeff[i] - else: - diff = abs(cpu_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) -def test_woverlap_multi_column(graph_file): +# ============================================================================= +# Pytest Fixtures +# ============================================================================= +@pytest.fixture(scope="module", params=UNDIRECTED_DATASETS) +def read_csv(request): + """ + Read csv file for both networkx and cugraph + """ + graph_file = request.param dataset_path = graph_file.get_path() M = utils.read_csv_for_nx(dataset_path) - cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 - G1 = cugraph.Graph() - G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] - ) - - G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] - vertex_pair = vertex_pair[:5] - - weight_arr = cudf.Series(np.ones(G2.number_of_vertices(), dtype=np.float32)) - - weights = cudf.DataFrame() - weights["vertex"] = G2.nodes() - weights["vertex_"] = weights["vertex"] + 1000 - weights["weight"] = weight_arr - - df_res = cugraph.overlap_w(G1, weights, vertex_pair) - - weights = weights[["vertex", "weight"]] - df_exp = cugraph.overlap_w(G2, weights, vertex_pair[["src_0", "dst_0"]]) - - # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["overlap_coeff"], expected["overlap_coeff"]) + return M, graph_file @pytest.mark.sg -def test_invalid_datasets_overlap_w(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") - with pytest.raises(ValueError): - cugraph.overlap_w(G, None) +def test_wjaccard(read_csv): + M, graph_file = read_csv + G = cugraph.Graph() + G = graph_file.get_graph(ignore_weights=False) + cugraph.sorensen_w(G, weights=None) From 0f15b974d0241a38ec98234dc7e21a464cc04f26 Mon Sep 17 00:00:00 2001 From: Md Naim Date: Mon, 28 Aug 2023 14:37:20 -0700 Subject: [PATCH 08/16] Change test names --- python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py | 2 +- python/cugraph/cugraph/tests/link_prediction/test_woverlap.py | 4 ++-- .../cugraph/cugraph/tests/link_prediction/test_wsorensen.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py index 2f1fd743bdb..ce86d03367d 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py @@ -45,7 +45,7 @@ def read_csv(request): @pytest.mark.sg -def test_wjaccard(read_csv): +def test_jaccard_w(read_csv): M, graph_file = read_csv G = cugraph.Graph() G = graph_file.get_graph(ignore_weights=False) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py b/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py index 247ea9ed905..06479eb0b14 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py @@ -45,8 +45,8 @@ def read_csv(request): @pytest.mark.sg -def test_wjaccard(read_csv): +def test_overlap_w(read_csv): M, graph_file = read_csv G = cugraph.Graph() G = graph_file.get_graph(ignore_weights=False) - cugraph.sorensen_w(G, weights=None) + cugraph.overlap_w(G, weights=None) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py index e95abfbf517..f331f9c9b77 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py @@ -46,7 +46,7 @@ def read_csv(request): @pytest.mark.sg -def test_wsorensen(read_csv): +def test_sorensen_w(read_csv): M, graph_file = read_csv G = cugraph.Graph() G = graph_file.get_graph(ignore_weights=False) From 8cd7866490ff0493736e08af159c9be3fc7f83e3 Mon Sep 17 00:00:00 2001 From: Md Naim Date: Mon, 28 Aug 2023 15:19:44 -0700 Subject: [PATCH 09/16] Update doc strings, keep APIs backward compatible --- .../cugraph/link_prediction/jaccard.py | 28 ++++++--- .../cugraph/link_prediction/overlap.py | 60 ++++++++++++------ .../cugraph/link_prediction/sorensen.py | 61 +++++++++++++------ 3 files changed, 102 insertions(+), 47 deletions(-) diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py index 4b628400dc6..4b7502cd626 100644 --- a/python/cugraph/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/link_prediction/jaccard.py @@ -113,8 +113,14 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=False, use_weight= current implementation computes the jaccard coefficient for all adjacent vertices in the graph. + do_expensive_check : bool, optional (default=False) + Deprecated, no longer needed + use_weight : bool, optional (default=False) - Currently not supported + Flag to indicate whether to compute weighted jaccard (if use_weight==True) + or un-weighted jaccard (if use_weight==False). + 'input_graph' must be wighted if 'use_weight=True'. + Returns ------- @@ -141,13 +147,11 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=False, use_weight= >>> df = jaccard(input_graph) """ - - print("use_weight =", use_weight) - - warnings.warn( - "do_expensive_check is deprecated since it is no longer needed", - DeprecationWarning, - ) + if do_expensive_check: + warnings.warn( + "do_expensive_check is deprecated since it is no longer needed", + DeprecationWarning, + ) if input_graph.is_directed(): raise ValueError("Input must be an undirected Graph.") @@ -219,7 +223,7 @@ def jaccard_coefficient(G, ebunch=None, do_expensive_check=False): adjacent vertices in the graph. do_expensive_check : bool, optional (default=False) - Currently not supported + Deprecated, longer needed Returns ------- @@ -245,6 +249,12 @@ def jaccard_coefficient(G, ebunch=None, do_expensive_check=False): >>> df = cugraph.jaccard_coefficient(G) """ + if do_expensive_check: + warnings.warn( + "do_expensive_check is deprecated since it is no longer needed", + DeprecationWarning, + ) + vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) diff --git a/python/cugraph/cugraph/link_prediction/overlap.py b/python/cugraph/cugraph/link_prediction/overlap.py index 231906951a0..cafc86becb5 100644 --- a/python/cugraph/cugraph/link_prediction/overlap.py +++ b/python/cugraph/cugraph/link_prediction/overlap.py @@ -28,7 +28,6 @@ # FIXME: Move this function to the utility module so that it can be # shared by other algos def ensure_valid_dtype(input_graph, vertex_pair): - vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] vertex_pair_dtypes = vertex_pair.dtypes @@ -44,7 +43,7 @@ def ensure_valid_dtype(input_graph, vertex_pair): return vertex_pair -def overlap_coefficient(G, ebunch=None, use_weight=False): +def overlap_coefficient(G, ebunch=None, do_expensive_check=False): """ For NetworkX Compatability. See `overlap` @@ -64,8 +63,8 @@ def overlap_coefficient(G, ebunch=None, use_weight=False): current implementation computes the overlap coefficient for all adjacent vertices in the graph. - use_weight : bool, optional (default=False) - Currently not supported + do_expensive_check : bool, optional (default=False) + Deprecated, no longer needed Returns ------- @@ -91,6 +90,11 @@ def overlap_coefficient(G, ebunch=None, use_weight=False): >>> G = karate.get_graph(download=True, ignore_weights=True) >>> df = overlap_coefficient(G) """ + if do_expensive_check: + warnings.warn( + "do_expensive_check is deprecated since it is no longer needed", + DeprecationWarning, + ) vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) @@ -109,12 +113,14 @@ def overlap_coefficient(G, ebunch=None, use_weight=False): return df -# FIXME: + +# FIXME: # 1. Be consistent with column names for output/result DataFrame # 2. Enforce that 'vertex_pair' is a cudf Dataframe with only columns for vertex pairs # 3. We need to add support for multi-column vertices -def overlap(G, vertex_pair=None, use_weight=False): + +def overlap(input_graph, vertex_pair=None, do_expensive_check=False, use_weight=False): """ Compute the Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -134,7 +140,7 @@ def overlap(G, vertex_pair=None, use_weight=False): Parameters ---------- - G : cugraph.Graph + input_graph : cugraph.Graph cuGraph Graph instance, should contain the connectivity information as an edge list (edge weights are not supported yet for this algorithm). The adjacency list will be computed if not already present. @@ -146,8 +152,15 @@ def overlap(G, vertex_pair=None, use_weight=False): vertices. If provided, the overlap coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. + do_expensive_check : bool, optional (default=False) + Deprecated, no longer needed + use_weight : bool, optional (default=False) - Currently not supported + Flag to indicate whether to compute weighted overlap (if use_weight==True) + or un-weighted overlap (if use_weight==False). + 'input_graph' must be wighted if 'use_weight=True'. + + Returns ------- @@ -170,24 +183,28 @@ def overlap(G, vertex_pair=None, use_weight=False): -------- >>> from cugraph.datasets import karate >>> from cugraph import overlap - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = overlap(G) + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = overlap(input_graph) """ + if do_expensive_check: + warnings.warn( + "do_expensive_check is deprecated since it is no longer needed", + DeprecationWarning, + ) - if G.is_directed(): + if input_graph.is_directed(): raise ValueError("Input must be an undirected Graph.") - if vertex_pair is None: # Call two_hop neighbor of the entire graph - vertex_pair = G.get_two_hop_neighbors() + vertex_pair = input_graph.get_two_hop_neighbors() v_p_num_col = len(vertex_pair.columns) if isinstance(vertex_pair, cudf.DataFrame): - vertex_pair = renumber_vertex_pair(G, vertex_pair) - vertex_pair = ensure_valid_dtype(G, vertex_pair) + vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + vertex_pair = ensure_valid_dtype(input_graph, vertex_pair) src_col_name = vertex_pair.columns[0] dst_col_name = vertex_pair.columns[1] first = vertex_pair[src_col_name] @@ -196,19 +213,22 @@ def overlap(G, vertex_pair=None, use_weight=False): elif vertex_pair is not None: raise ValueError("vertex_pair must be a cudf dataframe") - first, second, overlap_coeff = pylibcugraph_overlap_coefficients( resource_handle=ResourceHandle(), - graph=G._plc_graph, + graph=input_graph._plc_graph, first=first, second=second, use_weight=use_weight, do_expensive_check=False, ) - if G.renumbered: - vertex_pair = G.unrenumber(vertex_pair, src_col_name, preserve_order=True) - vertex_pair = G.unrenumber(vertex_pair, dst_col_name, preserve_order=True) + if input_graph.renumbered: + vertex_pair = input_graph.unrenumber( + vertex_pair, src_col_name, preserve_order=True + ) + vertex_pair = input_graph.unrenumber( + vertex_pair, dst_col_name, preserve_order=True + ) if v_p_num_col == 2: # single column vertex diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py index e1cc0d64747..8f7f9012fcd 100644 --- a/python/cugraph/cugraph/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/link_prediction/sorensen.py @@ -27,7 +27,6 @@ # FIXME: Move this function to the utility module so that it can be # shared by other algos def ensure_valid_dtype(input_graph, vertex_pair): - vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] vertex_pair_dtypes = vertex_pair.dtypes @@ -42,12 +41,14 @@ def ensure_valid_dtype(input_graph, vertex_pair): return vertex_pair -# FIXME: + +# FIXME: # 1. Be consistent with column names for output/result DataFrame # 2. Enforce that 'vertex_pair' is a cudf Dataframe with only columns for vertex pairs # 3. We need to add support for multi-column vertices -def sorensen(G, vertex_pair=None, use_weight=False): + +def sorensen(input_graph, vertex_pair=None, do_expensive_check=False, use_weight=False): """ Compute the Sorensen coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -64,7 +65,7 @@ def sorensen(G, vertex_pair=None, use_weight=False): Parameters ---------- - G : cugraph.Graph + input_graph : cugraph.Graph cuGraph Graph instance, should contain the connectivity information as an edge list (edge weights are not supported yet for this algorithm). The graph should be undirected where an undirected edge is represented by a @@ -80,8 +81,15 @@ def sorensen(G, vertex_pair=None, use_weight=False): current implementation computes the Sorensen coefficient for all adjacent vertices in the graph. + do_expensive_check : bool, optional (default=False) + Deprecated, no longer needed + use_weight : bool, optional (default=False) - Currently not supported + Flag to indicate whether to compute weighted sorensen (if use_weight==True) + or un-weighted sorensen (if use_weight==False). + 'input_graph' must be wighted if 'use_weight=True'. + + Returns ------- @@ -104,22 +112,28 @@ def sorensen(G, vertex_pair=None, use_weight=False): -------- >>> from cugraph.datasets import karate >>> from cugraph import sorensen - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = sorensen(G) + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = sorensen(input_graph) """ - if G.is_directed(): + if do_expensive_check: + warnings.warn( + "do_expensive_check is deprecated since it is no longer needed", + DeprecationWarning, + ) + + if input_graph.is_directed(): raise ValueError("Input must be an undirected Graph.") if vertex_pair is None: # Call two_hop neighbor of the entire graph - vertex_pair = G.get_two_hop_neighbors() + vertex_pair = input_graph.get_two_hop_neighbors() v_p_num_col = len(vertex_pair.columns) if isinstance(vertex_pair, cudf.DataFrame): - vertex_pair = renumber_vertex_pair(G, vertex_pair) - vertex_pair = ensure_valid_dtype(G, vertex_pair) + vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + vertex_pair = ensure_valid_dtype(input_graph, vertex_pair) src_col_name = vertex_pair.columns[0] dst_col_name = vertex_pair.columns[1] first = vertex_pair[src_col_name] @@ -130,16 +144,20 @@ def sorensen(G, vertex_pair=None, use_weight=False): first, second, sorensen_coeff = pylibcugraph_sorensen_coefficients( resource_handle=ResourceHandle(), - graph=G._plc_graph, + graph=input_graph._plc_graph, first=first, second=second, use_weight=use_weight, do_expensive_check=False, ) - if G.renumbered: - vertex_pair = G.unrenumber(vertex_pair, src_col_name, preserve_order=True) - vertex_pair = G.unrenumber(vertex_pair, dst_col_name, preserve_order=True) + if input_graph.renumbered: + vertex_pair = input_graph.unrenumber( + vertex_pair, src_col_name, preserve_order=True + ) + vertex_pair = input_graph.unrenumber( + vertex_pair, dst_col_name, preserve_order=True + ) if v_p_num_col == 2: # single column vertex @@ -153,7 +171,7 @@ def sorensen(G, vertex_pair=None, use_weight=False): return df -def sorensen_coefficient(G, ebunch=None, use_weight=False): +def sorensen_coefficient(G, ebunch=None, do_expensive_check=False): """ For NetworkX Compatability. See `sorensen` @@ -171,8 +189,8 @@ def sorensen_coefficient(G, ebunch=None, use_weight=False): given vertex pairs. If the vertex_pair is not provided then the current implementation computes the sorensen coefficient for all adjacent vertices in the graph. - use_weight : bool, optional (default=False) - Currently not supported + do_expensive_check : bool, optional (default=False) + Deprecated, no longer needed Returns ------- @@ -199,6 +217,13 @@ def sorensen_coefficient(G, ebunch=None, use_weight=False): >>> df = sorensen_coef(G) """ + + if do_expensive_check: + warnings.warn( + "do_expensive_check is deprecated since it is no longer needed", + DeprecationWarning, + ) + vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) From a0c4bad0eac9a16ee29ded6d5e57bf57f8373049 Mon Sep 17 00:00:00 2001 From: Md Naim Date: Mon, 28 Aug 2023 15:36:39 -0700 Subject: [PATCH 10/16] checkout pre-commit-config.yaml from 23.10 --- .pre-commit-config.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3cc848762f7..0f05aedf1a1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,14 +11,14 @@ repos: - id: debug-statements - id: mixed-line-ending - repo: https://github.com/psf/black - rev: 23.7.0 + rev: 22.10.0 hooks: - id: black language_version: python3 args: [--target-version=py38] files: ^python/ - repo: https://github.com/PyCQA/flake8 - rev: 6.1.0 + rev: 6.0.0 hooks: - id: flake8 args: ["--config=.flake8"] @@ -27,13 +27,13 @@ repos: types_or: [python] # TODO: Enable [python, cython] additional_dependencies: ["flake8-force"] - repo: https://github.com/asottile/yesqa - rev: v1.5.0 + rev: v1.3.0 hooks: - id: yesqa additional_dependencies: - flake8==6.0.0 - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v16.0.6 + rev: v16.0.1 hooks: - id: clang-format exclude: | @@ -52,7 +52,7 @@ repos: pass_filenames: false additional_dependencies: [gitpython] - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.5.2 + rev: v1.5.1 hooks: - id: rapids-dependency-file-generator args: ["--clean"] From 555e632943b9349df13d9d629f5985dde89cf43f Mon Sep 17 00:00:00 2001 From: Md Naim Date: Mon, 28 Aug 2023 15:49:11 -0700 Subject: [PATCH 11/16] stlye fix --- .../cugraph/cugraph/link_prediction/wjaccard.py | 15 +++------------ .../cugraph/cugraph/link_prediction/woverlap.py | 5 ++--- .../cugraph/cugraph/link_prediction/wsorensen.py | 7 +++---- 3 files changed, 8 insertions(+), 19 deletions(-) diff --git a/python/cugraph/cugraph/link_prediction/wjaccard.py b/python/cugraph/cugraph/link_prediction/wjaccard.py index 1f623cd4dd1..f139486c1ee 100644 --- a/python/cugraph/cugraph/link_prediction/wjaccard.py +++ b/python/cugraph/cugraph/link_prediction/wjaccard.py @@ -11,17 +11,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.structure.graph_classes import Graph -# from cugraph.link_prediction import jaccard_wrapper from cugraph.link_prediction import jaccard -import cudf - -from pylibcugraph import ( - jaccard_coefficients as pylibcugraph_jaccard_coefficients, -) -from pylibcugraph import ResourceHandle import warnings + # FIXME: Move this function to the utility module so that it can be # shared by other algos def ensure_valid_dtype(input_graph, vertex_pair): @@ -41,8 +34,6 @@ def ensure_valid_dtype(input_graph, vertex_pair): return vertex_pair - - def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=False): """ Compute the weighted Jaccard similarity between each pair of vertices @@ -124,6 +115,6 @@ def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=False): warning_msg = ( "jaccard_w is deprecated. To compute weighted jaccard, please use " "jaccard(input_graph, vertex_pair=False, use_weight=True)" - ) + ) warnings.warn(warning_msg, DeprecationWarning) - return jaccard(input_graph, vertex_pair, do_expensive_check, use_weight=True) \ No newline at end of file + return jaccard(input_graph, vertex_pair, do_expensive_check, use_weight=True) diff --git a/python/cugraph/cugraph/link_prediction/woverlap.py b/python/cugraph/cugraph/link_prediction/woverlap.py index 4a400c6b126..2acfbebafe1 100644 --- a/python/cugraph/cugraph/link_prediction/woverlap.py +++ b/python/cugraph/cugraph/link_prediction/woverlap.py @@ -11,11 +11,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -# from cugraph.link_prediction import overlap_wrapper from cugraph.link_prediction import overlap -import cudf import warnings + def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): """ Compute the weighted Overlap Coefficient between each pair of vertices @@ -99,6 +98,6 @@ def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): warning_msg = ( " overlap_w is deprecated. To compute weighted overlap, please use " "overlap(input_graph, vertex_pair=False, use_weight=True)" - ) + ) warnings.warn(warning_msg, DeprecationWarning) return overlap(input_graph, vertex_pair, do_expensive_check, use_weight=True) diff --git a/python/cugraph/cugraph/link_prediction/wsorensen.py b/python/cugraph/cugraph/link_prediction/wsorensen.py index a9d3acf15b2..5e23d293150 100644 --- a/python/cugraph/cugraph/link_prediction/wsorensen.py +++ b/python/cugraph/cugraph/link_prediction/wsorensen.py @@ -11,11 +11,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.structure.graph_classes import Graph from cugraph.link_prediction import sorensen -import cudf import warnings + def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): """ Compute the weighted Sorensen similarity between each pair of vertices @@ -93,8 +92,8 @@ def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): """ warning_msg = ( - f" sorensen_w is deprecated. To compute weighted sorensen, please use " + "sorensen_w is deprecated. To compute weighted sorensen, please use " "sorensen(input_graph, vertex_pair=False, use_weight=True)" - ) + ) warnings.warn(warning_msg, DeprecationWarning) return sorensen(input_graph, vertex_pair, use_weight=True) From 4ca2b0ca281da822955a2146a531986f293485a7 Mon Sep 17 00:00:00 2001 From: Md Naim Date: Tue, 29 Aug 2023 17:50:03 -0700 Subject: [PATCH 12/16] Update MG similarity tests --- cpp/tests/c_api/mg_similarity_test.c | 51 +++++++++++++++------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/cpp/tests/c_api/mg_similarity_test.c b/cpp/tests/c_api/mg_similarity_test.c index 0ac160245ab..336f6c50519 100644 --- a/cpp/tests/c_api/mg_similarity_test.c +++ b/cpp/tests/c_api/mg_similarity_test.c @@ -160,15 +160,16 @@ int test_jaccard(const cugraph_resource_handle_t* handle) int test_weighted_jaccard(const cugraph_resource_handle_t* handle) { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.357143, 0.208333, 0.0}; return generic_similarity_test(handle, h_src, @@ -216,15 +217,16 @@ int test_sorensen(const cugraph_resource_handle_t* handle) int test_weighted_sorensen(const cugraph_resource_handle_t* handle) { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.526316, 0.344828, 0.000000}; return generic_similarity_test(handle, h_src, @@ -272,15 +274,16 @@ int test_overlap(const cugraph_resource_handle_t* handle) int test_weighted_overlap(const cugraph_resource_handle_t* handle) { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.714286, 0.416667, 0.000000}; return generic_similarity_test(handle, h_src, From a7862d8dfe1834b732c786bb684cfc0e8dd6cfad Mon Sep 17 00:00:00 2001 From: Md Naim Date: Wed, 30 Aug 2023 15:40:44 -0700 Subject: [PATCH 13/16] Fix doc test errors --- python/cugraph/cugraph/link_prediction/jaccard.py | 3 ++- python/cugraph/cugraph/link_prediction/sorensen.py | 2 +- python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py index 4b7502cd626..d59f96d76f2 100644 --- a/python/cugraph/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/link_prediction/jaccard.py @@ -245,8 +245,9 @@ def jaccard_coefficient(G, ebunch=None, do_expensive_check=False): Examples -------- >>> from cugraph.datasets import karate + >>> from cugraph import jaccard_coefficient >>> G = karate.get_graph(download=True) - >>> df = cugraph.jaccard_coefficient(G) + >>> df = jaccard_coefficient(G) """ if do_expensive_check: diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py index 8f7f9012fcd..17bc0f28890 100644 --- a/python/cugraph/cugraph/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/link_prediction/sorensen.py @@ -214,7 +214,7 @@ def sorensen_coefficient(G, ebunch=None, do_expensive_check=False): >>> from cugraph.datasets import karate >>> from cugraph import sorensen_coefficient >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = sorensen_coef(G) + >>> df = sorensen_coefficient(G) """ diff --git a/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx b/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx index 7c672153cf4..c36ae163d31 100644 --- a/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx @@ -132,7 +132,6 @@ def jaccard_coefficients(ResourceHandle resource_handle, &error_ptr) assert_success(error_code, error_ptr, "vertex_pairs") - print('cugraph_jaccard_coefficients use_weight:', use_weight) error_code = cugraph_jaccard_coefficients(c_resource_handle_ptr, c_graph_ptr, vertex_pairs_ptr, From 09f1ac06abf6e6b25bf37df3b0823f75f2a687ee Mon Sep 17 00:00:00 2001 From: Md Naim Date: Mon, 18 Sep 2023 17:51:39 -0700 Subject: [PATCH 14/16] Address PR comments - update doc strings and tests, remove a few test files --- .../cugraph/cugraph/experimental/__init__.py | 16 ++++ .../cugraph/link_prediction/jaccard.py | 87 ++++++++++------- .../cugraph/link_prediction/overlap.py | 86 ++++++++++------- .../cugraph/link_prediction/sorensen.py | 94 +++++++++++-------- .../cugraph/link_prediction/wjaccard.py | 29 +++++- .../cugraph/link_prediction/woverlap.py | 28 +++++- .../cugraph/link_prediction/wsorensen.py | 28 +++++- .../tests/link_prediction/test_jaccard.py | 22 +++-- .../tests/link_prediction/test_overlap.py | 8 +- .../tests/link_prediction/test_sorensen.py | 13 ++- .../tests/link_prediction/test_wjaccard.py | 52 ---------- .../tests/link_prediction/test_woverlap.py | 52 ---------- .../tests/link_prediction/test_wsorensen.py | 53 ----------- .../pylibcugraph/experimental/__init__.py | 1 - .../pylibcugraph/jaccard_coefficients.pyx | 3 + .../pylibcugraph/overlap_coefficients.pyx | 6 +- .../pylibcugraph/sorensen_coefficients.pyx | 6 +- 17 files changed, 280 insertions(+), 304 deletions(-) delete mode 100644 python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py delete mode 100644 python/cugraph/cugraph/tests/link_prediction/test_woverlap.py delete mode 100644 python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py diff --git a/python/cugraph/cugraph/experimental/__init__.py b/python/cugraph/cugraph/experimental/__init__.py index 4140f5bdf02..2309a529047 100644 --- a/python/cugraph/cugraph/experimental/__init__.py +++ b/python/cugraph/cugraph/experimental/__init__.py @@ -51,3 +51,19 @@ from cugraph.gnn.data_loading import EXPERIMENTAL__BulkSampler BulkSampler = experimental_warning_wrapper(EXPERIMENTAL__BulkSampler) + + +from cugraph.link_prediction.jaccard import jaccard, jaccard_coefficient + +jaccard = promoted_experimental_warning_wrapper(jaccard) +jaccard_coefficient = promoted_experimental_warning_wrapper(jaccard_coefficient) + +from cugraph.link_prediction.sorensen import sorensen, sorensen_coefficient + +sorensen = promoted_experimental_warning_wrapper(sorensen) +sorensen_coefficient = promoted_experimental_warning_wrapper(sorensen_coefficient) + +from cugraph.link_prediction.overlap import overlap, overlap_coefficient + +overlap = promoted_experimental_warning_wrapper(overlap) +overlap_coefficient = promoted_experimental_warning_wrapper(overlap_coefficient) diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py index d59f96d76f2..e3d1f822fcb 100644 --- a/python/cugraph/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/link_prediction/jaccard.py @@ -18,12 +18,23 @@ ) import cudf import warnings +from typing import Union, Iterable from pylibcugraph import ( jaccard_coefficients as pylibcugraph_jaccard_coefficients, ) from pylibcugraph import ResourceHandle +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in the type annotation for +# induced_subgraph() is specified using a string literal to avoid depending on +# and importing networkx. Instead, networkx is imported optionally, which may +# cause a problem for a type checker if run in an environment where networkx is +# not installed. +networkx = import_optional("networkx") + # FIXME: Move this function to the utility module so that it can be # shared by other algos @@ -43,13 +54,12 @@ def ensure_valid_dtype(input_graph, vertex_pair): return vertex_pair -# FIXME: -# 1. Be consistent with column names for output/result DataFrame -# 2. Enforce that 'vertex_pair' is a cudf Dataframe with only columns for vertex pairs -# 3. We need to add support for multi-column vertices - - -def jaccard(input_graph, vertex_pair=None, do_expensive_check=False, use_weight=False): +def jaccard( + input_graph: Graph, + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated + use_weight: bool = False, +): """ Compute the Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -99,12 +109,11 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=False, use_weight= ---------- input_graph : cugraph.Graph cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. + as an edge list. The graph should be undirected where an undirected + edge is represented by a directed edge in both direction.The adjacency + list will be computed if not already present. - This implementation only supports undirected, unweighted Graph. + This implementation only supports undirected, non-multi Graphs. vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of @@ -114,12 +123,15 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=False, use_weight= adjacent vertices in the graph. do_expensive_check : bool, optional (default=False) - Deprecated, no longer needed + Deprecated. + Originally, when set to Ture, jaccard implementation checked if + the vertices in the graph are (re)numbered from 0 to V-1 where + V is the total number of vertices. use_weight : bool, optional (default=False) Flag to indicate whether to compute weighted jaccard (if use_weight==True) or un-weighted jaccard (if use_weight==False). - 'input_graph' must be wighted if 'use_weight=True'. + 'input_graph' must be weighted if 'use_weight=True'. Returns @@ -149,8 +161,9 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=False, use_weight= """ if do_expensive_check: warnings.warn( - "do_expensive_check is deprecated since it is no longer needed", - DeprecationWarning, + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, ) if input_graph.is_directed(): @@ -202,28 +215,38 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=False, use_weight= return df -def jaccard_coefficient(G, ebunch=None, do_expensive_check=False): +def jaccard_coefficient( + G: Union[Graph, "networkx.Graph"], + ebunch: Union[cudf.DataFrame, Iterable[Union[int, str, float]]] = None, + do_expensive_check: bool = False, # deprecated +): """ For NetworkX Compatability. See `jaccard` Parameters ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. + G : cugraph.Graph or NetworkX.Graph + cuGraph or NetworkX Graph instance, should contain the connectivity + information as an edge list. The graph should be undirected where an + undirected edge is represented by a directed edge in both direction. + The adjacency list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. - ebunch : cudf.DataFrame, optional (default=None) + ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None) A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the jaccard coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the jaccard coefficient for all - adjacent vertices in the graph. + vertices or iterable of 2-tuples (u, v) where u and v are nodes in + the graph. + + If provided, the Overlap coefficient is computed for the given vertex + pairs. Otherwise, the current implementation computes the overlap + coefficient for all adjacent vertices in the graph. do_expensive_check : bool, optional (default=False) - Deprecated, longer needed + Deprecated. + Originally, when set to Ture, jaccard implementation checked if + the vertices in the graph are (re)numbered from 0 to V-1 where + V is the total number of vertices. Returns ------- @@ -250,12 +273,6 @@ def jaccard_coefficient(G, ebunch=None, do_expensive_check=False): >>> df = jaccard_coefficient(G) """ - if do_expensive_check: - warnings.warn( - "do_expensive_check is deprecated since it is no longer needed", - DeprecationWarning, - ) - vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) diff --git a/python/cugraph/cugraph/link_prediction/overlap.py b/python/cugraph/cugraph/link_prediction/overlap.py index cafc86becb5..ddc62e068f6 100644 --- a/python/cugraph/cugraph/link_prediction/overlap.py +++ b/python/cugraph/cugraph/link_prediction/overlap.py @@ -18,12 +18,23 @@ ) import cudf import warnings +from typing import Union, Iterable from pylibcugraph import ( overlap_coefficients as pylibcugraph_overlap_coefficients, ) from pylibcugraph import ResourceHandle +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in the type annotation for +# induced_subgraph() is specified using a string literal to avoid depending on +# and importing networkx. Instead, networkx is imported optionally, which may +# cause a problem for a type checker if run in an environment where networkx is +# not installed. +networkx = import_optional("networkx") + # FIXME: Move this function to the utility module so that it can be # shared by other algos @@ -43,28 +54,38 @@ def ensure_valid_dtype(input_graph, vertex_pair): return vertex_pair -def overlap_coefficient(G, ebunch=None, do_expensive_check=False): +def overlap_coefficient( + G: Union[Graph, "networkx.Graph"], + ebunch: Union[cudf.DataFrame, Iterable[Union[int, str, float]]] = None, + do_expensive_check: bool = False, # deprecated +): """ - For NetworkX Compatability. See `overlap` + Compute overlap coefficient. Parameters ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. + G : cugraph.Graph or NetworkX.Graph + cuGraph or NetworkX Graph instance, should contain the connectivity + information as an edge list. The graph should be undirected where an + undirected edge is represented by a directed edge in both direction. + The adjacency list will be computed if not already present. - ebunch : cudf.DataFrame, optional (default=None) + This implementation only supports undirected, non-multi edge Graph. + + ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None) A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the Overlap coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the overlap coefficient for all - adjacent vertices in the graph. + vertices or iterable of 2-tuples (u, v) where u and v are nodes in + the graph. + + If provided, the Overlap coefficient is computed for the given vertex + pairs. Otherwise, the current implementation computes the overlap + coefficient for all adjacent vertices in the graph. do_expensive_check : bool, optional (default=False) - Deprecated, no longer needed + Deprecated. + Originally, when set to Ture, overlap implementation checked if + the vertices in the graph are (re)numbered from 0 to V-1 where + V is the total number of vertices. Returns ------- @@ -90,11 +111,6 @@ def overlap_coefficient(G, ebunch=None, do_expensive_check=False): >>> G = karate.get_graph(download=True, ignore_weights=True) >>> df = overlap_coefficient(G) """ - if do_expensive_check: - warnings.warn( - "do_expensive_check is deprecated since it is no longer needed", - DeprecationWarning, - ) vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) @@ -114,13 +130,12 @@ def overlap_coefficient(G, ebunch=None, do_expensive_check=False): return df -# FIXME: -# 1. Be consistent with column names for output/result DataFrame -# 2. Enforce that 'vertex_pair' is a cudf Dataframe with only columns for vertex pairs -# 3. We need to add support for multi-column vertices - - -def overlap(input_graph, vertex_pair=None, do_expensive_check=False, use_weight=False): +def overlap( + input_graph: Graph, + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated + use_weight: bool = False, +): """ Compute the Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -142,23 +157,25 @@ def overlap(input_graph, vertex_pair=None, do_expensive_check=False, use_weight= ---------- input_graph : cugraph.Graph cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - adjacency list will be computed if not already present. - - This implementation only supports undirected, unweighted Graph. + as an edge list. The adjacency list will be computed if not already + present. + This implementation only supports undirected, non-multi edge Graph. vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of vertices. If provided, the overlap coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. do_expensive_check : bool, optional (default=False) - Deprecated, no longer needed + Deprecated. + Originally, when set to Ture, overlap implementation checked if + the vertices in the graph are (re)numbered from 0 to V-1 where + V is the total number of vertices. use_weight : bool, optional (default=False) Flag to indicate whether to compute weighted overlap (if use_weight==True) or un-weighted overlap (if use_weight==False). - 'input_graph' must be wighted if 'use_weight=True'. + 'input_graph' must be weighted if 'use_weight=True'. @@ -189,8 +206,9 @@ def overlap(input_graph, vertex_pair=None, do_expensive_check=False, use_weight= """ if do_expensive_check: warnings.warn( - "do_expensive_check is deprecated since it is no longer needed", - DeprecationWarning, + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, ) if input_graph.is_directed(): diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py index 17bc0f28890..2331309beec 100644 --- a/python/cugraph/cugraph/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/link_prediction/sorensen.py @@ -18,11 +18,23 @@ ) import cudf import warnings +from typing import Union, Iterable + from pylibcugraph import ( sorensen_coefficients as pylibcugraph_sorensen_coefficients, ) from pylibcugraph import ResourceHandle +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in the type annotation for +# induced_subgraph() is specified using a string literal to avoid depending on +# and importing networkx. Instead, networkx is imported optionally, which may +# cause a problem for a type checker if run in an environment where networkx is +# not installed. +networkx = import_optional("networkx") + # FIXME: Move this function to the utility module so that it can be # shared by other algos @@ -42,13 +54,12 @@ def ensure_valid_dtype(input_graph, vertex_pair): return vertex_pair -# FIXME: -# 1. Be consistent with column names for output/result DataFrame -# 2. Enforce that 'vertex_pair' is a cudf Dataframe with only columns for vertex pairs -# 3. We need to add support for multi-column vertices - - -def sorensen(input_graph, vertex_pair=None, do_expensive_check=False, use_weight=False): +def sorensen( + input_graph: Graph, + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated + use_weight: bool = False, +): """ Compute the Sorensen coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -67,12 +78,10 @@ def sorensen(input_graph, vertex_pair=None, do_expensive_check=False, use_weight ---------- input_graph : cugraph.Graph cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. + as an edge list. The adjacency list will be computed if not already + present. - This implementation only supports undirected, unweighted Graph. + This implementation only supports undirected, non-multi edge Graph. vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of @@ -82,14 +91,15 @@ def sorensen(input_graph, vertex_pair=None, do_expensive_check=False, use_weight adjacent vertices in the graph. do_expensive_check : bool, optional (default=False) - Deprecated, no longer needed + Deprecated. + Originally, when set to Ture, sorensen implementation checked if + the vertices in the graph are (re)numbered from 0 to V-1 where + V is the total number of vertices. use_weight : bool, optional (default=False) Flag to indicate whether to compute weighted sorensen (if use_weight==True) or un-weighted sorensen (if use_weight==False). - 'input_graph' must be wighted if 'use_weight=True'. - - + 'input_graph' must be weighted if 'use_weight=True'. Returns ------- @@ -118,8 +128,9 @@ def sorensen(input_graph, vertex_pair=None, do_expensive_check=False, use_weight """ if do_expensive_check: warnings.warn( - "do_expensive_check is deprecated since it is no longer needed", - DeprecationWarning, + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, ) if input_graph.is_directed(): @@ -171,26 +182,38 @@ def sorensen(input_graph, vertex_pair=None, do_expensive_check=False, use_weight return df -def sorensen_coefficient(G, ebunch=None, do_expensive_check=False): +def sorensen_coefficient( + G: Union[Graph, "networkx.Graph"], + ebunch: Union[cudf.DataFrame, Iterable[Union[int, str, float]]] = None, + do_expensive_check: bool = False, # deprecated +): """ - For NetworkX Compatability. See `sorensen` + Compute sorensen coefficient. Parameters ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - ebunch : cudf.DataFrame, optional (default=None) + G : cugraph.Graph or NetworkX.Graph + cuGraph or NetworkX Graph instance, should contain the connectivity + information as an edge list. The graph should be undirected where an + undirected edge is represented by a directed edge in both direction. + The adjacency list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. + + ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None) A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the sorensen coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the sorensen coefficient for all - adjacent vertices in the graph. + vertices or iterable of 2-tuples (u, v) where u and v are nodes in + the graph. + + If provided, the Overlap coefficient is computed for the given vertex + pairs. Otherwise, the current implementation computes the overlap + coefficient for all adjacent vertices in the graph. + do_expensive_check : bool, optional (default=False) - Deprecated, no longer needed + Deprecated. + Originally, when set to Ture, sorensen implementation checked if + the vertices in the graph are (re)numbered from 0 to V-1 where + V is the total number of vertices. Returns ------- @@ -217,13 +240,6 @@ def sorensen_coefficient(G, ebunch=None, do_expensive_check=False): >>> df = sorensen_coefficient(G) """ - - if do_expensive_check: - warnings.warn( - "do_expensive_check is deprecated since it is no longer needed", - DeprecationWarning, - ) - vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) diff --git a/python/cugraph/cugraph/link_prediction/wjaccard.py b/python/cugraph/cugraph/link_prediction/wjaccard.py index f139486c1ee..e8f9a65b930 100644 --- a/python/cugraph/cugraph/link_prediction/wjaccard.py +++ b/python/cugraph/cugraph/link_prediction/wjaccard.py @@ -12,8 +12,19 @@ # limitations under the License. from cugraph.link_prediction import jaccard +import cudf import warnings +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in the type annotation for +# induced_subgraph() is specified using a string literal to avoid depending on +# and importing networkx. Instead, networkx is imported optionally, which may +# cause a problem for a type checker if run in an environment where networkx is +# not installed. +networkx = import_optional("networkx") + # FIXME: Move this function to the utility module so that it can be # shared by other algos @@ -34,7 +45,13 @@ def ensure_valid_dtype(input_graph, vertex_pair): return vertex_pair -def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=False): +# input_graph, weights, vertex_pair=None, do_expensive_check=False): +def jaccard_w( + input_graph: Graph, + weights: cudf.DataFrame = None, # deprecated + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated +): """ Compute the weighted Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by @@ -72,9 +89,11 @@ def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=False): vertices. If provided, the jaccard coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. - do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + Originally, when set to Ture, jaccard implementation checked if + the vertices in the graph are (re)numbered from 0 to V-1 where + V is the total number of vertices. Returns ------- @@ -116,5 +135,5 @@ def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=False): "jaccard_w is deprecated. To compute weighted jaccard, please use " "jaccard(input_graph, vertex_pair=False, use_weight=True)" ) - warnings.warn(warning_msg, DeprecationWarning) + warnings.warn(warning_msg, FutureWarning) return jaccard(input_graph, vertex_pair, do_expensive_check, use_weight=True) diff --git a/python/cugraph/cugraph/link_prediction/woverlap.py b/python/cugraph/cugraph/link_prediction/woverlap.py index 2acfbebafe1..38d57488da7 100644 --- a/python/cugraph/cugraph/link_prediction/woverlap.py +++ b/python/cugraph/cugraph/link_prediction/woverlap.py @@ -12,10 +12,26 @@ # limitations under the License. from cugraph.link_prediction import overlap +import cudf import warnings +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional -def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): +# FIXME: the networkx.Graph type used in the type annotation for +# induced_subgraph() is specified using a string literal to avoid depending on +# and importing networkx. Instead, networkx is imported optionally, which may +# cause a problem for a type checker if run in an environment where networkx is +# not installed. +networkx = import_optional("networkx") + + +def overlap_w( + input_graph: Graph, + weights: cudf.DataFrame = None, # deprecated + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated +): """ Compute the weighted Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by @@ -54,9 +70,11 @@ def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): vertices. If provided, the overlap coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. - do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + Originally, when set to Ture, overlap implementation checked if + the vertices in the graph are (re)numbered from 0 to V-1 where + V is the total number of vertices. Returns ------- @@ -99,5 +117,5 @@ def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): " overlap_w is deprecated. To compute weighted overlap, please use " "overlap(input_graph, vertex_pair=False, use_weight=True)" ) - warnings.warn(warning_msg, DeprecationWarning) + warnings.warn(warning_msg, FutureWarning) return overlap(input_graph, vertex_pair, do_expensive_check, use_weight=True) diff --git a/python/cugraph/cugraph/link_prediction/wsorensen.py b/python/cugraph/cugraph/link_prediction/wsorensen.py index 5e23d293150..692c41c5bc8 100644 --- a/python/cugraph/cugraph/link_prediction/wsorensen.py +++ b/python/cugraph/cugraph/link_prediction/wsorensen.py @@ -12,10 +12,26 @@ # limitations under the License. from cugraph.link_prediction import sorensen +import cudf import warnings +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional -def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): +# FIXME: the networkx.Graph type used in the type annotation for +# induced_subgraph() is specified using a string literal to avoid depending on +# and importing networkx. Instead, networkx is imported optionally, which may +# cause a problem for a type checker if run in an environment where networkx is +# not installed. +networkx = import_optional("networkx") + + +def sorensen_w( + input_graph: Graph, + weights: cudf.DataFrame = None, # deprecated + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated +): """ Compute the weighted Sorensen similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by @@ -49,9 +65,11 @@ def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): vertices. If provided, the sorensen coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. - do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + Originally, when set to Ture, sorensen implementation checked if + the vertices in the graph are (re)numbered from 0 to V-1 where + V is the total number of vertices. Returns ------- @@ -95,5 +113,5 @@ def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): "sorensen_w is deprecated. To compute weighted sorensen, please use " "sorensen(input_graph, vertex_pair=False, use_weight=True)" ) - warnings.warn(warning_msg, DeprecationWarning) + warnings.warn(warning_msg, FutureWarning) return sorensen(input_graph, vertex_pair, use_weight=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index f2e1486b2f3..7ce7d263eda 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -23,6 +23,7 @@ from cugraph.datasets import netscience from cugraph.testing import utils, UNDIRECTED_DATASETS from cudf.testing import assert_series_equal +from cudf.testing.testing import assert_frame_equal SRC_COL = "0" DST_COL = "1" @@ -36,9 +37,6 @@ MULTI_COL_DST_1_COL = "dst_1" -print("Networkx version : {} ".format(nx.__version__)) - - # ============================================================================= # Pytest Setup / Teardown - called for each test function # ============================================================================= @@ -194,8 +192,20 @@ def test_jaccard(read_csv, gpubenchmark, use_weight): print("Mismatches: %d" % err) assert err == 0 else: + G = graph_file.get_graph() + res_w_jaccard = cugraph.jaccard_w(G, vertex_pair=M_cu[[SRC_COL, DST_COL]]) + res_w_jaccard = res_w_jaccard.sort_values( + [VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL] + ).reset_index(drop=True) + res_jaccard = cudf.DataFrame() + res_jaccard[VERTEX_PAIR_FIRST_COL] = cu_src + res_jaccard[VERTEX_PAIR_SECOND_COL] = cu_dst + res_jaccard[JACCARD_COEFF_COL] = cu_coeff + assert_frame_equal( + res_w_jaccard, res_jaccard, check_dtype=False, check_like=True + ) + # FIXME: compare weighted jaccard results against resultset api - pass @pytest.mark.sg @@ -347,7 +357,3 @@ def test_weighted_jaccard(): G = karate.get_graph(ignore_weights=True) with pytest.raises(ValueError): cugraph.jaccard(G, use_weight=True) - - G = karate.get_graph(ignore_weights=True) - with pytest.raises(ValueError): - cugraph.jaccard(G, use_weight=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py index 89951586e1f..e24deaa61ac 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py @@ -21,6 +21,7 @@ import cugraph from cugraph.testing import utils, UNDIRECTED_DATASETS from cudf.testing import assert_series_equal +from cudf.testing.testing import assert_frame_equal SRC_COL = "0" DST_COL = "1" @@ -66,6 +67,9 @@ def cugraph_call(benchmark_callable, graph_file, pairs, use_weight=False): df = df.sort_values(by=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( drop=True ) + if use_weight: + res_w_overlap = cugraph.overlap_w(G, vertex_pair=pairs) + assert_frame_equal(res_w_overlap, df, check_dtype=False, check_like=True) return df[OVERLAP_COEFF_COL].to_numpy() @@ -240,7 +244,3 @@ def test_weighted_overlap(): G = karate.get_graph(ignore_weights=True) with pytest.raises(ValueError): cugraph.overlap(G, use_weight=True) - - G = karate.get_graph(ignore_weights=True) - with pytest.raises(ValueError): - cugraph.overlap(G, use_weight=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py index 0175201a086..6b4074fce30 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py @@ -21,6 +21,7 @@ from cugraph.testing import utils, UNDIRECTED_DATASETS from cugraph.datasets import netscience from cudf.testing import assert_series_equal +from cudf.testing.testing import assert_frame_equal SRC_COL = "0" DST_COL = "1" @@ -33,8 +34,6 @@ MULTI_COL_SRC_1_COL = "src_1" MULTI_COL_DST_1_COL = "dst_1" -print("Networkx version : {} ".format(nx.__version__)) - # ============================================================================= # Pytest Setup / Teardown - called for each test function @@ -85,7 +84,11 @@ def compare_sorensen_two_hop(G, Gnx, use_weight=False): assert diff < 1.0e-6 else: # FIXME: compare results against resultset api - pass + res_w_sorensen = cugraph.sorensen_w(G, vertex_pair=pairs) + res_w_sorensen = res_w_sorensen.sort_values( + [VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL] + ).reset_index(drop=True) + assert_frame_equal(res_w_sorensen, df, check_dtype=False, check_like=True) def cugraph_call(benchmark_callable, graph_file, input_df=None, use_weight=False): @@ -341,7 +344,3 @@ def test_weighted_sorensen(): G = karate.get_graph(ignore_weights=True) with pytest.raises(ValueError): cugraph.sorensen(G, use_weight=True) - - G = karate.get_graph(ignore_weights=True) - with pytest.raises(ValueError): - cugraph.sorensen(G, use_weight=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py deleted file mode 100644 index ce86d03367d..00000000000 --- a/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc - -import pytest -import networkx as nx -import cugraph -from cugraph.testing import utils, UNDIRECTED_DATASETS - - -print("Networkx version : {} ".format(nx.__version__)) - - -# ============================================================================= -# Pytest Setup / Teardown - called for each test function -# ============================================================================= -def setup_function(): - gc.collect() - - -# ============================================================================= -# Pytest Fixtures -# ============================================================================= -@pytest.fixture(scope="module", params=UNDIRECTED_DATASETS) -def read_csv(request): - """ - Read csv file for both networkx and cugraph - """ - graph_file = request.param - dataset_path = graph_file.get_path() - M = utils.read_csv_for_nx(dataset_path) - - return M, graph_file - - -@pytest.mark.sg -def test_jaccard_w(read_csv): - M, graph_file = read_csv - G = cugraph.Graph() - G = graph_file.get_graph(ignore_weights=False) - cugraph.jaccard_w(G, weights=None) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py b/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py deleted file mode 100644 index 06479eb0b14..00000000000 --- a/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc - -import pytest -import networkx as nx -import cugraph -from cugraph.testing import utils, UNDIRECTED_DATASETS - - -print("Networkx version : {} ".format(nx.__version__)) - - -# ============================================================================= -# Pytest Setup / Teardown - called for each test function -# ============================================================================= -def setup_function(): - gc.collect() - - -# ============================================================================= -# Pytest Fixtures -# ============================================================================= -@pytest.fixture(scope="module", params=UNDIRECTED_DATASETS) -def read_csv(request): - """ - Read csv file for both networkx and cugraph - """ - graph_file = request.param - dataset_path = graph_file.get_path() - M = utils.read_csv_for_nx(dataset_path) - - return M, graph_file - - -@pytest.mark.sg -def test_overlap_w(read_csv): - M, graph_file = read_csv - G = cugraph.Graph() - G = graph_file.get_graph(ignore_weights=False) - cugraph.overlap_w(G, weights=None) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py deleted file mode 100644 index f331f9c9b77..00000000000 --- a/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc - -import pytest -import networkx as nx - -import cugraph -from cugraph.testing import utils, UNDIRECTED_DATASETS - - -print("Networkx version : {} ".format(nx.__version__)) - - -# ============================================================================= -# Pytest Setup / Teardown - called for each test function -# ============================================================================= -def setup_function(): - gc.collect() - - -# ============================================================================= -# Pytest Fixtures -# ============================================================================= -@pytest.fixture(scope="module", params=UNDIRECTED_DATASETS) -def read_csv(request): - """ - Read csv file for both networkx and cugraph - """ - graph_file = request.param - dataset_path = graph_file.get_path() - M = utils.read_csv_for_nx(dataset_path) - - return M, graph_file - - -@pytest.mark.sg -def test_sorensen_w(read_csv): - M, graph_file = read_csv - G = cugraph.Graph() - G = graph_file.get_graph(ignore_weights=False) - cugraph.sorensen_w(G, weights=None) diff --git a/python/pylibcugraph/pylibcugraph/experimental/__init__.py b/python/pylibcugraph/pylibcugraph/experimental/__init__.py index c09899d8eda..6194ace5956 100644 --- a/python/pylibcugraph/pylibcugraph/experimental/__init__.py +++ b/python/pylibcugraph/pylibcugraph/experimental/__init__.py @@ -74,7 +74,6 @@ from pylibcugraph.node2vec import node2vec -node2vec = promoted_experimental_warning_wrapper(node2vec) # from pylibcugraph.jaccard_coefficients import EXPERIMENTAL__jaccard_coefficients diff --git a/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx b/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx index c36ae163d31..59e94aeb615 100644 --- a/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx @@ -86,6 +86,9 @@ def jaccard_coefficients(ResourceHandle resource_handle, Destination of the vertex pair. use_weight : bool, optional + If set to True, the compute weighted jaccard_coefficients( + the input graph must be weighted in that case). + Otherwise, computed un-weighted jaccard_coefficients do_expensive_check : bool If True, performs more extensive tests on the inputs to ensure diff --git a/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx b/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx index 3e75abf94f2..28360121c64 100644 --- a/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx @@ -84,8 +84,10 @@ def overlap_coefficients(ResourceHandle resource_handle, second : Destination of the vertex pair. - use_weight : bool, optional (default=False) - Currently not supported + use_weight : bool, optional + If set to True, the compute weighted jaccard_coefficients( + the input graph must be weighted in that case). + Otherwise, computed un-weighted jaccard_coefficients do_expensive_check : bool If True, performs more extensive tests on the inputs to ensure diff --git a/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx b/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx index 9dd5d5fd13e..983a635012f 100644 --- a/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx @@ -83,8 +83,10 @@ def sorensen_coefficients(ResourceHandle resource_handle, second : Destination of the vertex pair. - use_weight : bool, optional (default=False) - Currently not supported + use_weight : bool, optional + If set to True, the compute weighted jaccard_coefficients( + the input graph must be weighted in that case). + Otherwise, computed un-weighted jaccard_coefficients do_expensive_check : bool If True, performs more extensive tests on the inputs to ensure From 70c0b5bd06effa1d0905a3597985dd5f6dad70cd Mon Sep 17 00:00:00 2001 From: Md Naim Date: Wed, 20 Sep 2023 11:58:53 -0700 Subject: [PATCH 15/16] Updates a few comments, warnings and doc-strings --- .../cugraph/community/induced_subgraph.py | 9 +++-- .../cugraph/link_prediction/jaccard.py | 33 ++++++++++++------- .../cugraph/link_prediction/overlap.py | 32 +++++++++++------- .../cugraph/link_prediction/sorensen.py | 32 +++++++++++------- .../cugraph/link_prediction/wjaccard.py | 18 +++++----- .../cugraph/link_prediction/woverlap.py | 17 +++++----- .../cugraph/link_prediction/wsorensen.py | 17 +++++----- .../cugraph/cugraph/sampling/random_walks.py | 9 +++-- 8 files changed, 99 insertions(+), 68 deletions(-) diff --git a/python/cugraph/cugraph/community/induced_subgraph.py b/python/cugraph/cugraph/community/induced_subgraph.py index 29fe2f29c1e..3a901199b01 100644 --- a/python/cugraph/cugraph/community/induced_subgraph.py +++ b/python/cugraph/cugraph/community/induced_subgraph.py @@ -25,11 +25,10 @@ ) from cugraph.utilities.utils import import_optional -# FIXME: the networkx.Graph type used in the type annotation for -# induced_subgraph() is specified using a string literal to avoid depending on -# and importing networkx. Instead, networkx is imported optionally, which may -# cause a problem for a type checker if run in an environment where networkx is -# not installed. +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. networkx = import_optional("networkx") diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py index e3d1f822fcb..27bfa58e6b0 100644 --- a/python/cugraph/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/link_prediction/jaccard.py @@ -28,11 +28,10 @@ from cugraph.structure import Graph from cugraph.utilities.utils import import_optional -# FIXME: the networkx.Graph type used in the type annotation for -# induced_subgraph() is specified using a string literal to avoid depending on -# and importing networkx. Instead, networkx is imported optionally, which may -# cause a problem for a type checker if run in an environment where networkx is -# not installed. +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. networkx = import_optional("networkx") @@ -124,9 +123,12 @@ def jaccard( do_expensive_check : bool, optional (default=False) Deprecated. - Originally, when set to Ture, jaccard implementation checked if - the vertices in the graph are (re)numbered from 0 to V-1 where - V is the total number of vertices. + + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. use_weight : bool, optional (default=False) Flag to indicate whether to compute weighted jaccard (if use_weight==True) @@ -244,9 +246,11 @@ def jaccard_coefficient( do_expensive_check : bool, optional (default=False) Deprecated. - Originally, when set to Ture, jaccard implementation checked if - the vertices in the graph are (re)numbered from 0 to V-1 where - V is the total number of vertices. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. Returns ------- @@ -273,6 +277,13 @@ def jaccard_coefficient( >>> df = jaccard_coefficient(G) """ + if do_expensive_check: + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) + vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) diff --git a/python/cugraph/cugraph/link_prediction/overlap.py b/python/cugraph/cugraph/link_prediction/overlap.py index ddc62e068f6..3a25526679c 100644 --- a/python/cugraph/cugraph/link_prediction/overlap.py +++ b/python/cugraph/cugraph/link_prediction/overlap.py @@ -28,11 +28,10 @@ from cugraph.structure import Graph from cugraph.utilities.utils import import_optional -# FIXME: the networkx.Graph type used in the type annotation for -# induced_subgraph() is specified using a string literal to avoid depending on -# and importing networkx. Instead, networkx is imported optionally, which may -# cause a problem for a type checker if run in an environment where networkx is -# not installed. +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. networkx = import_optional("networkx") @@ -83,9 +82,11 @@ def overlap_coefficient( do_expensive_check : bool, optional (default=False) Deprecated. - Originally, when set to Ture, overlap implementation checked if - the vertices in the graph are (re)numbered from 0 to V-1 where - V is the total number of vertices. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. Returns ------- @@ -111,6 +112,13 @@ def overlap_coefficient( >>> G = karate.get_graph(download=True, ignore_weights=True) >>> df = overlap_coefficient(G) """ + if do_expensive_check: + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) + vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) @@ -168,9 +176,11 @@ def overlap( do_expensive_check : bool, optional (default=False) Deprecated. - Originally, when set to Ture, overlap implementation checked if - the vertices in the graph are (re)numbered from 0 to V-1 where - V is the total number of vertices. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. use_weight : bool, optional (default=False) Flag to indicate whether to compute weighted overlap (if use_weight==True) diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py index 2331309beec..a8ccced1e68 100644 --- a/python/cugraph/cugraph/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/link_prediction/sorensen.py @@ -28,11 +28,10 @@ from cugraph.structure import Graph from cugraph.utilities.utils import import_optional -# FIXME: the networkx.Graph type used in the type annotation for -# induced_subgraph() is specified using a string literal to avoid depending on -# and importing networkx. Instead, networkx is imported optionally, which may -# cause a problem for a type checker if run in an environment where networkx is -# not installed. +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. networkx = import_optional("networkx") @@ -92,9 +91,11 @@ def sorensen( do_expensive_check : bool, optional (default=False) Deprecated. - Originally, when set to Ture, sorensen implementation checked if - the vertices in the graph are (re)numbered from 0 to V-1 where - V is the total number of vertices. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. use_weight : bool, optional (default=False) Flag to indicate whether to compute weighted sorensen (if use_weight==True) @@ -211,9 +212,11 @@ def sorensen_coefficient( do_expensive_check : bool, optional (default=False) Deprecated. - Originally, when set to Ture, sorensen implementation checked if - the vertices in the graph are (re)numbered from 0 to V-1 where - V is the total number of vertices. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. Returns ------- @@ -240,6 +243,13 @@ def sorensen_coefficient( >>> df = sorensen_coefficient(G) """ + if do_expensive_check: + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) + vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) diff --git a/python/cugraph/cugraph/link_prediction/wjaccard.py b/python/cugraph/cugraph/link_prediction/wjaccard.py index e8f9a65b930..ec538bbc0ed 100644 --- a/python/cugraph/cugraph/link_prediction/wjaccard.py +++ b/python/cugraph/cugraph/link_prediction/wjaccard.py @@ -18,11 +18,10 @@ from cugraph.structure import Graph from cugraph.utilities.utils import import_optional -# FIXME: the networkx.Graph type used in the type annotation for -# induced_subgraph() is specified using a string literal to avoid depending on -# and importing networkx. Instead, networkx is imported optionally, which may -# cause a problem for a type checker if run in an environment where networkx is -# not installed. +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. networkx = import_optional("networkx") @@ -45,7 +44,6 @@ def ensure_valid_dtype(input_graph, vertex_pair): return vertex_pair -# input_graph, weights, vertex_pair=None, do_expensive_check=False): def jaccard_w( input_graph: Graph, weights: cudf.DataFrame = None, # deprecated @@ -91,9 +89,11 @@ def jaccard_w( do_expensive_check : bool, optional (default=False) Deprecated. - Originally, when set to Ture, jaccard implementation checked if - the vertices in the graph are (re)numbered from 0 to V-1 where - V is the total number of vertices. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. Returns ------- diff --git a/python/cugraph/cugraph/link_prediction/woverlap.py b/python/cugraph/cugraph/link_prediction/woverlap.py index 38d57488da7..5f43ad0670b 100644 --- a/python/cugraph/cugraph/link_prediction/woverlap.py +++ b/python/cugraph/cugraph/link_prediction/woverlap.py @@ -18,11 +18,10 @@ from cugraph.structure import Graph from cugraph.utilities.utils import import_optional -# FIXME: the networkx.Graph type used in the type annotation for -# induced_subgraph() is specified using a string literal to avoid depending on -# and importing networkx. Instead, networkx is imported optionally, which may -# cause a problem for a type checker if run in an environment where networkx is -# not installed. +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. networkx = import_optional("networkx") @@ -72,9 +71,11 @@ def overlap_w( do_expensive_check : bool, optional (default=False) Deprecated. - Originally, when set to Ture, overlap implementation checked if - the vertices in the graph are (re)numbered from 0 to V-1 where - V is the total number of vertices. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. Returns ------- diff --git a/python/cugraph/cugraph/link_prediction/wsorensen.py b/python/cugraph/cugraph/link_prediction/wsorensen.py index 692c41c5bc8..ff502b36837 100644 --- a/python/cugraph/cugraph/link_prediction/wsorensen.py +++ b/python/cugraph/cugraph/link_prediction/wsorensen.py @@ -18,11 +18,10 @@ from cugraph.structure import Graph from cugraph.utilities.utils import import_optional -# FIXME: the networkx.Graph type used in the type annotation for -# induced_subgraph() is specified using a string literal to avoid depending on -# and importing networkx. Instead, networkx is imported optionally, which may -# cause a problem for a type checker if run in an environment where networkx is -# not installed. +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. networkx = import_optional("networkx") @@ -67,9 +66,11 @@ def sorensen_w( do_expensive_check : bool, optional (default=False) Deprecated. - Originally, when set to Ture, sorensen implementation checked if - the vertices in the graph are (re)numbered from 0 to V-1 where - V is the total number of vertices. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. Returns ------- diff --git a/python/cugraph/cugraph/sampling/random_walks.py b/python/cugraph/cugraph/sampling/random_walks.py index 015c05d1b08..7b04dba82a5 100644 --- a/python/cugraph/cugraph/sampling/random_walks.py +++ b/python/cugraph/cugraph/sampling/random_walks.py @@ -25,11 +25,10 @@ from cugraph.utilities.utils import import_optional from typing import Union, Tuple -# FIXME: the networkx.Graph type used in the type annotation for -# induced_subgraph() is specified using a string literal to avoid depending on -# and importing networkx. Instead, networkx is imported optionally, which may -# cause a problem for a type checker if run in an environment where networkx is -# not installed. +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. networkx = import_optional("networkx") From fa609dd890ba812c37aa3aac81cecf82e0edb506 Mon Sep 17 00:00:00 2001 From: Md Naim Date: Tue, 26 Sep 2023 11:26:30 -0700 Subject: [PATCH 16/16] Adds logic to handle isolated vertices at python layer --- python/cugraph/cugraph/community/louvain.py | 44 +++++++++++++++++---- python/cugraph/cugraph/utilities/utils.py | 4 +- 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/python/cugraph/cugraph/community/louvain.py b/python/cugraph/cugraph/community/louvain.py index 7f9742c8f09..aa7fd9dbe1c 100644 --- a/python/cugraph/cugraph/community/louvain.py +++ b/python/cugraph/cugraph/community/louvain.py @@ -12,6 +12,7 @@ # limitations under the License. from cugraph.utilities import ( + is_nx_graph_type, ensure_cugraph_obj_for_nx, df_score_to_dictionary, ) @@ -21,6 +22,9 @@ from pylibcugraph import louvain as pylibcugraph_louvain from pylibcugraph import ResourceHandle +VERTEX_COL_NAME = "vertex" +CLUSTER_ID_COL_NAME = "partition" + # FIXME: max_level should default to 100 once max_iter is removed def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): @@ -72,9 +76,9 @@ def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): GPU data frame of size V containing two columns the vertex id and the partition id it is assigned to. - df['vertex'] : cudf.Series + result_df['vertex'] : cudf.Series Contains the vertex identifiers - df['partition'] : cudf.Series + result_df['partition'] : cudf.Series Contains the partition assigned to the vertices modularity_score : float @@ -89,6 +93,14 @@ def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): """ + isolated_vertices = list() + if is_nx_graph_type(type(G)): + num_of_vertices = len(G.nodes) + isolated_vertices = [v for v in range(num_of_vertices) if G.degree[v] == 0] + else: + # FIXME: Gather list of isolated vertices for other graph types + pass + G, isNx = ensure_cugraph_obj_for_nx(G) if G.is_directed(): @@ -112,6 +124,9 @@ def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): if max_level is None: max_level = 100 + if max_level > 1000: + max_level = 1000 + vertex, partition, mod_score = pylibcugraph_louvain( resource_handle=ResourceHandle(), graph=G._plc_graph, @@ -121,14 +136,27 @@ def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): do_expensive_check=False, ) - df = cudf.DataFrame() - df["vertex"] = vertex - df["partition"] = partition + result_df = cudf.DataFrame() + result_df[VERTEX_COL_NAME] = vertex + result_df[CLUSTER_ID_COL_NAME] = partition + + unique_cids = result_df[CLUSTER_ID_COL_NAME].unique() + max_cluster_id = -1 if len(result_df) == 0 else unique_cids.max() + + if len(isolated_vertices) > 0: + isolated_vtx_and_cids = cudf.DataFrame() + isolated_vtx_and_cids[VERTEX_COL_NAME] = isolated_vertices + isolated_vtx_and_cids[CLUSTER_ID_COL_NAME] = [ + (max_cluster_id + i + 1) for i in range(len(isolated_vertices)) + ] + result_df = cudf.concat( + [result_df, isolated_vtx_and_cids], ignore_index=True, sort=False + ) if G.renumbered: - df = G.unrenumber(df, "vertex") + result_df = G.unrenumber(result_df, VERTEX_COL_NAME) if isNx is True: - df = df_score_to_dictionary(df, "partition") + result_df = df_score_to_dictionary(result_df, CLUSTER_ID_COL_NAME) - return df, mod_score + return result_df, mod_score diff --git a/python/cugraph/cugraph/utilities/utils.py b/python/cugraph/cugraph/utilities/utils.py index e68b5dd4880..7a54a0bf2cf 100644 --- a/python/cugraph/cugraph/utilities/utils.py +++ b/python/cugraph/cugraph/utilities/utils.py @@ -364,8 +364,8 @@ def is_matrix_type(m): return is_cp_matrix_type(m) or is_sp_matrix_type(m) -def is_nx_graph_type(g): - return g in __nx_graph_types +def is_nx_graph_type(graph_type): + return graph_type in __nx_graph_types def is_cugraph_graph_type(g):