From d3b80a2d5baf4bd910d211ef2b9825fb29101302 Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Fri, 22 Nov 2024 17:40:21 +0100 Subject: [PATCH] extract the edgelist from the graph (#4750) This PR exposes the C++ function decompress_to_edgelist to the C, PLC and Python API. This will enable the extraction of the edgelist from a graph which is currently not supported. It also removes the deprecated parameter `legacy_renum_only` Authors: - Joseph Nke (https://github.com/jnke2016) Approvers: - Rick Ratzel (https://github.com/rlratzel) - Chuck Hastings (https://github.com/ChuckHastings) - Seunghwa Kang (https://github.com/seunghwak) URL: https://github.com/rapidsai/cugraph/pull/4750 --- cpp/CMakeLists.txt | 2 + cpp/include/cugraph_c/graph_functions.h | 102 +++++++++++ cpp/src/c_api/decompress_to_edgelist.cpp | 137 ++++++++++++++ cpp/src/c_api/edgelist.cpp | 83 +++++++++ cpp/src/c_api/edgelist.hpp | 34 ++++ .../cugraph/structure/graph_classes.py | 33 ---- .../simpleDistributedGraph.py | 99 ++++++++-- .../graph_implementation/simpleGraph.py | 75 ++++++-- .../cugraph/cugraph/structure/number_map.py | 27 +-- .../cugraph/tests/structure/test_graph.py | 52 ++++++ .../cugraph/tests/structure/test_graph_mg.py | 54 ++++++ .../pylibcugraph/pylibcugraph/CMakeLists.txt | 1 + python/pylibcugraph/pylibcugraph/__init__.py | 2 + .../_cugraph_c/graph_functions.pxd | 65 ++++++- .../pylibcugraph/decompress_to_edgelist.pyx | 169 ++++++++++++++++++ 15 files changed, 840 insertions(+), 95 deletions(-) create mode 100644 cpp/src/c_api/decompress_to_edgelist.cpp create mode 100644 cpp/src/c_api/edgelist.cpp create mode 100644 cpp/src/c_api/edgelist.hpp create mode 100644 python/pylibcugraph/pylibcugraph/decompress_to_edgelist.pyx diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 27e1999cb75..65772b4f5dd 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -538,6 +538,8 @@ add_library(cugraph_c src/c_api/weakly_connected_components.cpp 
src/c_api/strongly_connected_components.cpp src/c_api/allgather.cpp + src/c_api/decompress_to_edgelist.cpp + src/c_api/edgelist.cpp ) add_library(cugraph::cugraph_c ALIAS cugraph_c) diff --git a/cpp/include/cugraph_c/graph_functions.h b/cpp/include/cugraph_c/graph_functions.h index ff7e439232a..964b2f2c8d6 100644 --- a/cpp/include/cugraph_c/graph_functions.h +++ b/cpp/include/cugraph_c/graph_functions.h @@ -104,6 +104,8 @@ cugraph_error_code_t cugraph_two_hop_neighbors( /** * @brief Opaque induced subgraph type + * + * @deprecated This API will be deleted, use cugraph_edgelist_t */ typedef struct { int32_t align_; @@ -112,6 +114,8 @@ typedef struct { /** * @brief Get the source vertex ids * + * @deprecated This API will be deleted, use cugraph_edgelist_get_sources + * * @param [in] induced_subgraph Opaque pointer to induced subgraph * @return type erased array view of source vertex ids */ @@ -121,6 +125,8 @@ cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get_sources( /** * @brief Get the destination vertex ids * + * @deprecated This API will be deleted, use cugraph_edgelist_get_destinations + * * @param [in] induced_subgraph Opaque pointer to induced subgraph * @return type erased array view of destination vertex ids */ @@ -130,6 +136,8 @@ cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get_destinatio /** * @brief Get the edge weights * + * @deprecated This API will be deleted, use cugraph_edgelist_get_edge_weights + * * @param [in] induced_subgraph Opaque pointer to induced subgraph * @return type erased array view of edge weights */ @@ -139,6 +147,8 @@ cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get_edge_weigh /** * @brief Get the edge ids * + * @deprecated This API will be deleted, use cugraph_edgelist_get_edge_ids + * * @param [in] induced_subgraph Opaque pointer to induced subgraph * @return type erased array view of edge ids */ @@ -148,6 +158,8 @@ cugraph_type_erased_device_array_view_t* 
cugraph_induced_subgraph_get_edge_ids( /** * @brief Get the edge types * + * @deprecated This API will be deleted, use cugraph_edgelist_get_edge_type_ids + * * @param [in] induced_subgraph Opaque pointer to induced subgraph * @return type erased array view of edge types */ @@ -157,6 +169,8 @@ cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get_edge_type_ /** * @brief Get the subgraph offsets * + * @deprecated This API will be deleted, use cugraph_edgelist_get_edge_offsets + * * @param [in] induced_subgraph Opaque pointer to induced subgraph * @return type erased array view of subgraph identifiers */ @@ -166,6 +180,8 @@ cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get_subgraph_o /** * @brief Free induced subgraph * + * @deprecated This API will be deleted, use cugraph_edgelist_free + * * @param [in] induced subgraph Opaque pointer to induced subgraph */ void cugraph_induced_subgraph_result_free(cugraph_induced_subgraph_result_t* induced_subgraph); @@ -361,6 +377,92 @@ cugraph_type_erased_device_array_view_t* cugraph_degrees_result_get_out_degrees( */ void cugraph_degrees_result_free(cugraph_degrees_result_t* degrees_result); +/** + * @brief Opaque edgelist type + * + */ +typedef struct { + int32_t align_; +} cugraph_edgelist_t; + +/** + * @brief Get the source vertex ids + * + * @param [in] edgelist Opaque pointer to edgelist + * @return type erased array view of source vertex ids + */ +cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_sources(cugraph_edgelist_t* edgelist); + +/** + * @brief Get the destination vertex ids + * + * @param [in] edgelist Opaque pointer to edgelist + * @return type erased array view of destination vertex ids + */ +cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_destinations( + cugraph_edgelist_t* edgelist); + +/** + * @brief Get the edge weights + * + * @param [in] edgelist Opaque pointer to edgelist + * @return type erased array view of edge weights + */ 
+cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_weights( + cugraph_edgelist_t* edgelist); + +/** + * @brief Get the edge ids + * + * @param [in] edgelist Opaque pointer to edgelist + * @return type erased array view of edge ids + */ +cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_ids( + cugraph_edgelist_t* edgelist); + +/** + * @brief Get the edge types + * + * @param [in] edgelist Opaque pointer to edgelist + * @return type erased array view of edge types + */ +cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_type_ids( + cugraph_edgelist_t* edgelist); + +/** + * @brief Get the edge offsets + * + * @param [in] edgelist Opaque pointer to edgelist + * @return type erased array view of edge offsets + */ +cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_offsets( + cugraph_edgelist_t* edgelist); + +/** + * @brief Free edgelist + * + * @param [in] edgelist Opaque pointer to edgelist + */ +void cugraph_edgelist_free(cugraph_edgelist_t* edgelist); + +/** + * @brief Construct the edge list from the graph view object. + * + * @param [in] handle Handle for accessing resources + * @param [in] graph Graph to operate on + * @param [in] do_expensive_check A flag to run expensive checks for input arguments (if set to + * true) + * @param [out] result Opaque pointer to edgelist + * @param [out] error Pointer to an error object storing details of any error. 
Will + * be populated if error code is not CUGRAPH_SUCCESS + * @return error code + */ +cugraph_error_code_t cugraph_decompress_to_edgelist(const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + bool_t do_expensive_check, + cugraph_edgelist_t** result, + cugraph_error_t** error); + #ifdef __cplusplus } #endif diff --git a/cpp/src/c_api/decompress_to_edgelist.cpp b/cpp/src/c_api/decompress_to_edgelist.cpp new file mode 100644 index 00000000000..75bf0c0fd60 --- /dev/null +++ b/cpp/src/c_api/decompress_to_edgelist.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c_api/abstract_functor.hpp" +#include "c_api/core_result.hpp" +#include "c_api/edgelist.hpp" +#include "c_api/graph.hpp" +#include "c_api/resource_handle.hpp" +#include "c_api/utils.hpp" + +#include + +#include +#include +#include +#include + +#include + +namespace { + +struct decompress_to_edgelist_functor : public cugraph::c_api::abstract_functor { + raft::handle_t const& handle_; + cugraph::c_api::cugraph_graph_t* graph_{}; + + cugraph::c_api::cugraph_core_result_t const* core_result_{}; + bool do_expensive_check_{}; + cugraph::c_api::cugraph_edgelist_t* result_{}; + + decompress_to_edgelist_functor(cugraph_resource_handle_t const* handle, + cugraph_graph_t* graph, + bool do_expensive_check) + : abstract_functor(), + handle_(*reinterpret_cast(handle)->handle_), + graph_(reinterpret_cast(graph)), + do_expensive_check_(do_expensive_check) + { + } + + template + void operator()() + { + if constexpr (!cugraph::is_candidate::value) { + unsupported(); + } else { + if constexpr (store_transposed) { + error_code_ = cugraph::c_api:: + transpose_storage( + handle_, graph_, error_.get()); + if (error_code_ != CUGRAPH_SUCCESS) return; + } + + auto graph = + reinterpret_cast*>( + graph_->graph_); + + auto graph_view = graph->view(); + + auto edge_weights = reinterpret_cast, + weight_t>*>(graph_->edge_weights_); + + auto edge_ids = reinterpret_cast, + edge_t>*>(graph_->edge_ids_); + + auto edge_types = reinterpret_cast, + edge_type_type_t>*>(graph_->edge_types_); + + auto number_map = reinterpret_cast*>(graph_->number_map_); + + auto [result_src, result_dst, result_wgt, result_edge_id, result_edge_type] = + cugraph::decompress_to_edgelist( + handle_, + graph_view, + (edge_weights != nullptr) ? std::make_optional(edge_weights->view()) : std::nullopt, + (edge_ids != nullptr) ? std::make_optional(edge_ids->view()) : std::nullopt, + (edge_types != nullptr) ? std::make_optional(edge_types->view()) : std::nullopt, + (number_map != nullptr) ? 
std::make_optional>( + number_map->data(), number_map->size()) + : std::nullopt, + do_expensive_check_); + + result_ = new cugraph::c_api::cugraph_edgelist_t{ + new cugraph::c_api::cugraph_type_erased_device_array_t(result_src, graph_->vertex_type_), + new cugraph::c_api::cugraph_type_erased_device_array_t(result_dst, graph_->vertex_type_), + result_wgt ? new cugraph::c_api::cugraph_type_erased_device_array_t(*result_wgt, + graph_->weight_type_) + : NULL, + result_edge_id ? new cugraph::c_api::cugraph_type_erased_device_array_t(*result_edge_id, + graph_->edge_type_) + : NULL, + result_edge_type ? new cugraph::c_api::cugraph_type_erased_device_array_t( + *result_edge_type, graph_->edge_type_id_type_) + : NULL, + NULL}; + } + } +}; + +} // namespace + +extern "C" cugraph_error_code_t cugraph_decompress_to_edgelist( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + bool_t do_expensive_check, + cugraph_edgelist_t** result, + cugraph_error_t** error) +{ + decompress_to_edgelist_functor functor(handle, graph, do_expensive_check); + + return cugraph::c_api::run_algorithm(graph, functor, result, error); +} diff --git a/cpp/src/c_api/edgelist.cpp b/cpp/src/c_api/edgelist.cpp new file mode 100644 index 00000000000..640b2bf2853 --- /dev/null +++ b/cpp/src/c_api/edgelist.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c_api/edgelist.hpp" + +#include + +extern "C" cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_sources( + cugraph_edgelist_t* edgelist) +{ + auto internal_pointer = reinterpret_cast(edgelist); + return reinterpret_cast(internal_pointer->src_->view()); +} + +extern "C" cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_destinations( + cugraph_edgelist_t* edgelist) +{ + auto internal_pointer = reinterpret_cast(edgelist); + return reinterpret_cast(internal_pointer->dst_->view()); +} + +extern "C" cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_weights( + cugraph_edgelist_t* edgelist) +{ + auto internal_pointer = reinterpret_cast(edgelist); + return (internal_pointer->wgt_ == nullptr) + ? NULL + : reinterpret_cast( + internal_pointer->wgt_->view()); +} + +extern "C" cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_ids( + cugraph_edgelist_t* edgelist) +{ + auto internal_pointer = reinterpret_cast(edgelist); + return (internal_pointer->edge_ids_ == nullptr) + ? NULL + : reinterpret_cast( + internal_pointer->edge_ids_->view()); +} + +extern "C" cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_type_ids( + cugraph_edgelist_t* edgelist) +{ + auto internal_pointer = reinterpret_cast(edgelist); + return (internal_pointer->edge_type_ids_ == nullptr) + ? 
NULL + : reinterpret_cast( + internal_pointer->edge_type_ids_->view()); +} + +extern "C" cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_offsets( + cugraph_edgelist_t* edgelist) +{ /* Offsets are optional: cugraph_decompress_to_edgelist builds the edgelist with subgraph_offsets_ == NULL, so return NULL instead of dereferencing it. */ + auto internal_pointer = reinterpret_cast(edgelist); + return (internal_pointer->subgraph_offsets_ == nullptr) ? NULL : reinterpret_cast( + internal_pointer->subgraph_offsets_->view()); +} + +extern "C" void cugraph_edgelist_free(cugraph_edgelist_t* edgelist) +{ + auto internal_pointer = reinterpret_cast(edgelist); + delete internal_pointer->src_; + delete internal_pointer->dst_; + delete internal_pointer->wgt_; + delete internal_pointer->edge_ids_; + delete internal_pointer->edge_type_ids_; + delete internal_pointer->subgraph_offsets_; + delete internal_pointer; +} diff --git a/cpp/src/c_api/edgelist.hpp b/cpp/src/c_api/edgelist.hpp new file mode 100644 index 00000000000..bc0f2d337f1 --- /dev/null +++ b/cpp/src/c_api/edgelist.hpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "c_api/array.hpp" + +namespace cugraph { +namespace c_api { + +struct cugraph_edgelist_t { + cugraph_type_erased_device_array_t* src_{}; + cugraph_type_erased_device_array_t* dst_{}; + cugraph_type_erased_device_array_t* wgt_{}; + cugraph_type_erased_device_array_t* edge_ids_{}; + cugraph_type_erased_device_array_t* edge_type_ids_{}; + cugraph_type_erased_device_array_t* subgraph_offsets_{}; +}; + +} // namespace c_api +} // namespace cugraph diff --git a/python/cugraph/cugraph/structure/graph_classes.py b/python/cugraph/cugraph/structure/graph_classes.py index 84234f7e904..90f809fa6c1 100644 --- a/python/cugraph/cugraph/structure/graph_classes.py +++ b/python/cugraph/cugraph/structure/graph_classes.py @@ -115,7 +115,6 @@ def from_cudf_edgelist( edge_type=None, renumber=True, store_transposed=False, - legacy_renum_only=False, symmetrize=None, ): """ @@ -168,13 +167,6 @@ def from_cudf_edgelist( If True, stores the transpose of the adjacency matrix. Required for certain algorithms. - legacy_renum_only : bool, optional (default=False) - If True, skips the C++ renumbering step. Must be true for - pylibcugraph algorithms. Must be false for algorithms - not yet converted to the pylibcugraph C API. - - This parameter is deprecated and will be removed. - symmetrize: bool, optional (default=None) If True, symmetrize the edge list for an undirected graph. Setting this flag to True for a directed graph returns an error. The default @@ -210,7 +202,6 @@ def from_cudf_edgelist( edge_type=edge_type, renumber=renumber, store_transposed=store_transposed, - legacy_renum_only=legacy_renum_only, symmetrize=symmetrize, ) @@ -306,7 +297,6 @@ def from_dask_cudf_edgelist( edge_type=None, renumber=True, store_transposed=False, - legacy_renum_only=False, ): """ Initializes the distributed graph from the dask_cudf.DataFrame @@ -353,13 +343,6 @@ def from_dask_cudf_edgelist( If True, stores the transpose of the adjacency matrix. Required for certain algorithms. 
- legacy_renum_only : bool, optional (default=False) - If True, skips the C++ renumbering step. Must be true for - pylibcugraph algorithms. Must be false for algorithms - not yet converted to the pylibcugraph C API. - - This parameter is deprecated and will be removed. - """ if self._Impl is None: @@ -378,7 +361,6 @@ def from_dask_cudf_edgelist( edge_type=edge_type, renumber=renumber, store_transposed=store_transposed, - legacy_renum_only=legacy_renum_only, ) # Move to Compat Module @@ -869,7 +851,6 @@ def from_cudf_edgelist( edge_attr=None, renumber=True, store_transposed=False, - legacy_renum_only=False, ): """ Initialize a graph from the edge list. It is an error to call this @@ -909,13 +890,6 @@ def from_cudf_edgelist( If True, stores the transpose of the adjacency matrix. Required for certain algorithms. - legacy_renum_only : bool, optional (default=False) - If True, skips the C++ renumbering step. Must be true for - pylibcugraph algorithms. Must be false for algorithms - not yet converted to the pylibcugraph C API. - - This parameter is deprecated and will be removed. - Examples -------- >>> df = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ', @@ -945,7 +919,6 @@ def from_dask_cudf_edgelist( edge_attr=None, renumber=True, store_transposed=False, - legacy_renum_only=False, ): """ Initializes the distributed graph from the dask_cudf.DataFrame @@ -980,12 +953,6 @@ def from_dask_cudf_edgelist( If True, stores the transpose of the adjacency matrix. Required for certain algorithms. - legacy_renum_only : bool, optional (default=False) - If True, skips the C++ renumbering step. Must be true for - pylibcugraph algorithms. Must be false for algorithms - not yet converted to the pylibcugraph C API. - - This parameter is deprecated and will be removed. 
""" raise TypeError("Distributed N-partite graph not supported") diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index 83dad234287..ced72a6bbe2 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -14,6 +14,7 @@ import gc from typing import Union, Iterable import warnings +from typing import Tuple import cudf import cupy as cp @@ -31,6 +32,7 @@ degrees as pylibcugraph_degrees, in_degrees as pylibcugraph_in_degrees, out_degrees as pylibcugraph_out_degrees, + decompress_to_edgelist as pylibcugraph_decompress_to_edgelist, ) from cugraph.structure.number_map import NumberMap @@ -172,7 +174,6 @@ def __from_edgelist( edge_type=None, renumber=True, store_transposed=False, - legacy_renum_only=False, symmetrize=None, ): if not isinstance(input_ddf, dask_cudf.DataFrame): @@ -333,9 +334,7 @@ def __from_edgelist( # the edgelist_df and not do any renumbering. # C++ renumbering is enabled by default for algorithms that # support it (but only called if renumbering is on) - self.compute_renumber_edge_list( - transposed=store_transposed, legacy_renum_only=legacy_renum_only - ) + self.compute_renumber_edge_list(transposed=store_transposed) if renumber is False: self.properties.renumbered = False @@ -979,6 +978,84 @@ def convert_to_cudf(cp_arrays): return ddf + def decompress_to_edgelist( + self, return_unrenumbered_edgelist: bool = True + ) -> dask_cudf.DataFrame: + """ + Extract the edgelist from a graph. + + Parameters + ---------- + return_unrenumbered_edgelist : bool (default=True) + Flag determining whether to return the original + input edgelist if 'True' or the renumbered one + if 'False' and the edgelist was renumbered. 
+ + Returns + ------- + df : dask_cudf.cudf.DataFrame + Distributed GPU data frame containing all induced sources identifiers, + destination identifiers, and if applicable edge weights, edge ids and + edge types + """ + + # Initialize dask client + client = default_client() + + do_expensive_check = False + + def _call_decompress_to_edgelist( + sID: bytes, + mg_graph_x, + do_expensive_check: bool, + ) -> Tuple[cp.ndarray, cp.ndarray, cp.ndarray, cp.ndarray]: + return pylibcugraph_decompress_to_edgelist( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + do_expensive_check=do_expensive_check, + ) + + result = [ + client.submit( + _call_decompress_to_edgelist, + Comms.get_session_id(), + self._plc_graph[w], + do_expensive_check, + ) + for w in Comms.get_workers() + ] + wait(result) + + def convert_to_cudf(cp_arrays: cp.ndarray) -> cudf.DataFrame: + cp_src, cp_dst, cp_weight, cp_edge_ids, cp_edge_type_ids = cp_arrays + + df = cudf.DataFrame() + df["src"] = cp_src + df["dst"] = cp_dst + if cp_weight is not None: + df["weight"] = cp_weight + if cp_edge_ids is not None: + df["edge_ids"] = cp_edge_ids + if cp_edge_type_ids is not None: + df["edge_type_ids"] = cp_edge_type_ids + + return df + + cudf_result = [ + client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result + ] + + wait(cudf_result) + + ddf = dask_cudf.from_delayed(cudf_result).persist() + wait(ddf) + + if self.properties.renumbered and return_unrenumbered_edgelist: + ddf = self.renumber_map.unrenumber(ddf, "src") + ddf = self.renumber_map.unrenumber(ddf, "dst") + + return ddf + def select_random_vertices( self, random_state: int = None, num_vertices: int = None ) -> Union[dask_cudf.Series, dask_cudf.DataFrame]: @@ -1214,7 +1291,7 @@ def neighbors(self, n): ddf = self.edgelist.edgelist_df return ddf[ddf["src"] == n]["dst"].reset_index(drop=True) - def compute_renumber_edge_list(self, transposed=False, legacy_renum_only=False): + def compute_renumber_edge_list(self, 
transposed=False): """ Compute a renumbered edge list This function works in the MNMG pipeline and will transform @@ -1237,20 +1314,9 @@ def compute_renumber_edge_list(self, transposed=False, legacy_renum_only=False): structure. If False, renumber with the intent to make a CSR-like structure. Defaults to False. - legacy_renum_only : (optional) bool - if True, The C++ renumbering will not be triggered. - This parameter is added for new algos following the - C/Pylibcugraph path - This parameter is deprecated and will be removed. """ - if legacy_renum_only: - warning_msg = ( - "The parameter 'legacy_renum_only' is deprecated and will be removed." - ) - warnings.warn(warning_msg, DeprecationWarning) - if not self.properties.renumber: self.edgelist = self.EdgeList(self.input_df) self.renumber_map = None @@ -1269,7 +1335,6 @@ def compute_renumber_edge_list(self, transposed=False, legacy_renum_only=False): self.source_columns, self.destination_columns, store_transposed=transposed, - legacy_renum_only=legacy_renum_only, ) self.edgelist = self.EdgeList(renumbered_ddf) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index 858b114ebdc..4523b7f13b8 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -14,6 +14,7 @@ from cugraph.structure import graph_primtypes_wrapper from cugraph.structure.replicate_edgelist import replicate_cudf_dataframe from cugraph.structure.symmetrize import symmetrize as symmetrize_df +from pylibcugraph import decompress_to_edgelist as pylibcugraph_decompress_to_edgelist from cugraph.structure.number_map import NumberMap import cugraph.dask.common.mg_utils as mg_utils import cudf @@ -132,17 +133,9 @@ def __from_edgelist( edge_id=None, edge_type=None, renumber=True, - legacy_renum_only=False, store_transposed=False, symmetrize=None, ): - if 
legacy_renum_only: - warning_msg = ( - "The parameter 'legacy_renum_only' is deprecated and will be removed." - ) - warnings.warn( - warning_msg, - ) if self.properties.directed and symmetrize: raise ValueError( @@ -266,11 +259,7 @@ def __from_edgelist( if renumber: # FIXME: Should SG do lazy evaluation like MG? elist, renumber_map = NumberMap.renumber( - elist, - source, - destination, - store_transposed=False, - legacy_renum_only=legacy_renum_only, + elist, source, destination, store_transposed=False ) source = renumber_map.renumbered_src_col_name destination = renumber_map.renumbered_dst_col_name @@ -312,6 +301,8 @@ def __from_edgelist( # FIXME: if the user calls self.edgelist.edgelist_df after creating a # symmetric graph, return the symmetric edgelist? + # FIXME: For better memory footprint, avoid storing this edgelist and instead + # call decompress_to_edgelist to extract the edgelist from the graph self.edgelist = simpleGraphImpl.EdgeList( elist[source], elist[destination], value_col ) @@ -804,6 +795,64 @@ def get_two_hop_neighbors(self, start_vertices=None): return df + def decompress_to_edgelist( + self, return_unrenumbered_edgelist: bool = True + ) -> cudf.DataFrame: + """ + Extract the edgelist from a graph. + + Parameters + ---------- + return_unrenumbered_edgelist : bool (default=True) + Flag determining whether to return the original input edgelist + if 'True' or the renumbered one if 'False' and the edgelist was + renumbered. 
+ + Returns + ------- + + df : cudf.DataFrame + GPU data frame containing all sources identifiers, + destination identifiers and if applicable edge weights, edge ids and + edge types + + Examples + -------- + >>> from cugraph.datasets import karate + >>> G = karate.get_graph(download=True) + >>> edgelist = G.decompress_to_edgelist() + + """ + + do_expensive_check = False + ( + source, + destination, + weight, + edge_ids, + edge_type_ids, + ) = pylibcugraph_decompress_to_edgelist( + resource_handle=ResourceHandle(), + graph=self._plc_graph, + do_expensive_check=do_expensive_check, + ) + + df = cudf.DataFrame() + df["src"] = source + df["dst"] = destination + if weight is not None: + df["weight"] = weight + if edge_ids is not None: + df["edge_ids"] = edge_ids + if edge_type_ids is not None: + df["edge_type_ids"] = edge_type_ids + + if self.properties.renumbered and return_unrenumbered_edgelist: + df, _ = self.renumber_map.unrenumber(df, "src", get_column_names=True) + df, _ = self.renumber_map.unrenumber(df, "dst", get_column_names=True) + + return df + def select_random_vertices( self, random_state: int = None, diff --git a/python/cugraph/cugraph/structure/number_map.py b/python/cugraph/cugraph/structure/number_map.py index b0118fee960..39738daff36 100644 --- a/python/cugraph/cugraph/structure/number_map.py +++ b/python/cugraph/cugraph/structure/number_map.py @@ -18,7 +18,6 @@ import dask_cudf import numpy as np import cudf -import warnings class NumberMap: @@ -462,12 +461,7 @@ def from_internal_vertex_id( @staticmethod def renumber_and_segment( - df, - src_col_names, - dst_col_names, - preserve_order=False, - store_transposed=False, - legacy_renum_only=False, + df, src_col_names, dst_col_names, preserve_order=False, store_transposed=False ): """ Given an input dataframe with its column names, this function returns the @@ -475,11 +469,6 @@ def renumber_and_segment( to external vertex IDs. 
the parameter 'preserve_order' ensures that the order of the edges is preserved during renumbering. """ - if legacy_renum_only: - warning_msg = ( - "The parameter 'legacy_renum_only' is deprecated and will be removed." - ) - warnings.warn(warning_msg, DeprecationWarning) renumbered = False @@ -584,20 +573,10 @@ def renumber_and_segment( @staticmethod def renumber( - df, - src_col_names, - dst_col_names, - preserve_order=False, - store_transposed=False, - legacy_renum_only=False, + df, src_col_names, dst_col_names, preserve_order=False, store_transposed=False ): return NumberMap.renumber_and_segment( - df, - src_col_names, - dst_col_names, - preserve_order, - store_transposed, - legacy_renum_only, + df, src_col_names, dst_col_names, preserve_order, store_transposed )[0:2] def unrenumber(self, df, column_name, preserve_order=False, get_column_names=False): diff --git a/python/cugraph/cugraph/tests/structure/test_graph.py b/python/cugraph/cugraph/tests/structure/test_graph.py index b3e517100e1..6fcfef726b1 100644 --- a/python/cugraph/cugraph/tests/structure/test_graph.py +++ b/python/cugraph/cugraph/tests/structure/test_graph.py @@ -179,6 +179,58 @@ def test_add_edge_list_to_adj_list(graph_file): assert values_cu is None +@pytest.mark.sg +@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("is_directed", [True, False]) +@pytest.mark.parametrize("renumber", [True, False]) +def test_decompress_to_edgelist(graph_file, is_directed, renumber): + input_df = utils.read_csv_file(graph_file) + input_df = input_df.rename(columns={"0": "src", "1": "dst", "2": "weight"}) + + G = cugraph.Graph(directed=is_directed) + input_df_ = cudf.DataFrame() + if renumber: + input_df_["src_0"] = cudf.Series(input_df["src"]) + input_df_["dst_0"] = cudf.Series(input_df["dst"]) + input_df_["weight"] = cudf.Series(input_df["weight"]) + input_df_["src_1"] = input_df_["src_0"] + 1000 + input_df_["dst_1"] = input_df_["dst_0"] + 1000 + + input_df = input_df_ + source = 
["src_0", "src_1"] + destination = ["dst_0", "dst_1"] + else: + source = "src" + destination = "dst" + + G.from_cudf_edgelist( + input_df, source=source, destination=destination, weight="weight", renumber=True + ) + + extracted_df = G.decompress_to_edgelist(return_unrenumbered_edgelist=True) + + if renumber: + extracted_df = extracted_df.rename( + columns={ + "0_src": "src_0", + "1_src": "src_1", + "0_dst": "dst_0", + "1_dst": "dst_1", + } + ) + extracted_df = extracted_df.sort_values( + ["src_0", "src_1", "dst_0", "dst_1"] + ).reset_index(drop=True) + input_df = input_df.sort_values( + ["src_0", "src_1", "dst_0", "dst_1"] + ).reset_index(drop=True) + else: + extracted_df = extracted_df.sort_values(["src", "dst"]).reset_index(drop=True) + input_df = input_df.sort_values(["src", "dst"]).reset_index(drop=True) + + assert_frame_equal(input_df, extracted_df, check_dtype=False, check_like=True) + + # Test @pytest.mark.sg @pytest.mark.parametrize("graph_file", utils.DATASETS) diff --git a/python/cugraph/cugraph/tests/structure/test_graph_mg.py b/python/cugraph/cugraph/tests/structure/test_graph_mg.py index f2cc1583f93..e5eeb0f653b 100644 --- a/python/cugraph/cugraph/tests/structure/test_graph_mg.py +++ b/python/cugraph/cugraph/tests/structure/test_graph_mg.py @@ -420,3 +420,64 @@ def test_graph_creation_properties(dask_client, graph_file, directed, renumber): assert sG.number_of_nodes() == mG.number_of_nodes() assert sG.number_of_edges() == mG.number_of_edges() assert_frame_equal(sG_edgelist_view, mG_edgelist_view, check_dtype=False) + + +@pytest.mark.parametrize("directed", [True, False]) +@pytest.mark.parametrize("renumber", [True, False]) +@pytest.mark.parametrize("graph_file", datasets) +def test_decompress_to_edgelist(dask_client, graph_file, directed, renumber): + input_df = utils.read_csv_file(graph_file) + input_df = input_df.rename(columns={"0": "src", "1": "dst", "2": "weight"}) + + G = cugraph.Graph(directed=directed) + input_df_ = cudf.DataFrame() + if 
renumber: + input_df_["src_0"] = cudf.Series(input_df["src"]) + input_df_["dst_0"] = cudf.Series(input_df["dst"]) + input_df_["weight"] = cudf.Series(input_df["weight"]) + input_df_["src_1"] = input_df_["src_0"] + 1000 + input_df_["dst_1"] = input_df_["dst_0"] + 1000 + + input_df = input_df_ + source = ["src_0", "src_1"] + destination = ["dst_0", "dst_1"] + else: + source = "src" + destination = "dst" + num_workers = len(Comms.get_workers()) + + input_ddf = dask_cudf.from_cudf(input_df, npartitions=num_workers) + + # NOTE(review): this rebuilds G as directed and ignores the 'directed' + # parameter, so both parametrizations run the same case; confirm intent. + G = cugraph.Graph(directed=True) + G.from_dask_cudf_edgelist( + input_ddf, source=source, destination=destination, weight="weight" + ) + + extracted_df = ( + G.decompress_to_edgelist(return_unrenumbered_edgelist=True) + .compute() + .reset_index(drop=True) + ) + + if renumber: + extracted_df = extracted_df.rename( + columns={ + "0_src": "src_0", + "1_src": "src_1", + "0_dst": "dst_0", + "1_dst": "dst_1", + } + ) + extracted_df = extracted_df.sort_values( + ["src_0", "src_1", "dst_0", "dst_1"] + ).reset_index(drop=True) + input_df = input_df.sort_values( + ["src_0", "src_1", "dst_0", "dst_1"] + ).reset_index(drop=True) + else: + extracted_df = extracted_df.sort_values(["src", "dst"]).reset_index(drop=True) + input_df = input_df.sort_values(["src", "dst"]).reset_index(drop=True) + + assert_frame_equal(input_df, extracted_df, check_dtype=False, check_like=True) diff --git a/python/pylibcugraph/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/CMakeLists.txt index fb46030bc56..fe7c4b64aa5 100644 --- a/python/pylibcugraph/pylibcugraph/CMakeLists.txt +++ b/python/pylibcugraph/pylibcugraph/CMakeLists.txt @@ -70,6 +70,7 @@ set(cython_sources homogeneous_biased_neighbor_sample.pyx homogeneous_uniform_neighbor_sample.pyx edge_id_lookup_table.pyx + decompress_to_edgelist.pyx ) set(linked_libraries 
cugraph::cugraph;cugraph::cugraph_c) diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index 5aa351f9ce1..9047144c13a 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -126,6 +126,8 @@ from pylibcugraph.degrees import in_degrees, out_degrees, degrees +from
pylibcugraph.decompress_to_edgelist import decompress_to_edgelist + from pylibcugraph import exceptions diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd index b8f16cb94c8..b27a7230a13 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd @@ -122,41 +122,41 @@ cdef extern from "cugraph_c/graph_functions.h": ########################################################################### # induced_subgraph - ctypedef struct cugraph_induced_subgraph_result_t: + ctypedef struct cugraph_induced_subgraph_result_t: # Deprecated pass cdef cugraph_type_erased_device_array_view_t* \ - cugraph_induced_subgraph_get_sources( + cugraph_induced_subgraph_get_sources( # Deprecated cugraph_induced_subgraph_result_t* induced_subgraph ) cdef cugraph_type_erased_device_array_view_t* \ - cugraph_induced_subgraph_get_destinations( + cugraph_induced_subgraph_get_destinations( # Deprecated cugraph_induced_subgraph_result_t* induced_subgraph ) cdef cugraph_type_erased_device_array_view_t* \ - cugraph_induced_subgraph_get_edge_weights( + cugraph_induced_subgraph_get_edge_weights( # Deprecated cugraph_induced_subgraph_result_t* induced_subgraph ) cdef cugraph_type_erased_device_array_view_t* \ - cugraph_induced_subgraph_get_edge_ids( + cugraph_induced_subgraph_get_edge_ids( # Deprecated cugraph_induced_subgraph_result_t* induced_subgraph ) cdef cugraph_type_erased_device_array_view_t* \ - cugraph_induced_subgraph_get_edge_type_ids( + cugraph_induced_subgraph_get_edge_type_ids( # Deprecated cugraph_induced_subgraph_result_t* induced_subgraph ) cdef cugraph_type_erased_device_array_view_t* \ - cugraph_induced_subgraph_get_subgraph_offsets( + cugraph_induced_subgraph_get_subgraph_offsets( # Deprecated cugraph_induced_subgraph_result_t* induced_subgraph ) cdef void \ - cugraph_induced_subgraph_result_free( + 
cugraph_induced_subgraph_result_free( # Deprecated cugraph_induced_subgraph_result_t* induced_subgraph ) @@ -250,3 +250,52 @@ cdef extern from "cugraph_c/graph_functions.h": cugraph_degrees_result_free( cugraph_degrees_result_t* degrees_result ) + + ########################################################################### + # decompress_to_edgelist: opaque result, array accessors, free, entry point + ctypedef struct cugraph_edgelist_t: + pass + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_edgelist_get_sources( + cugraph_edgelist_t* edgelist + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_edgelist_get_destinations( + cugraph_edgelist_t* edgelist + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_edgelist_get_edge_weights( + cugraph_edgelist_t* edgelist + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_edgelist_get_edge_ids( + cugraph_edgelist_t* edgelist + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_edgelist_get_edge_type_ids( + cugraph_edgelist_t* edgelist + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_edgelist_get_edge_offsets( + cugraph_edgelist_t* edgelist + ) + + cdef void \ + cugraph_edgelist_free( + cugraph_edgelist_t* edgelist + ) + + cdef cugraph_error_code_t \ + cugraph_decompress_to_edgelist( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + bool_t do_expensive_check, + cugraph_edgelist_t** result, + cugraph_error_t** error + ) diff --git a/python/pylibcugraph/pylibcugraph/decompress_to_edgelist.pyx b/python/pylibcugraph/pylibcugraph/decompress_to_edgelist.pyx new file mode 100644 index 00000000000..58c29940aba --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/decompress_to_edgelist.pyx @@ -0,0 +1,169 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Have cython use python 3 syntax +# cython: language_level = 3 + + +from pylibcugraph._cugraph_c.types cimport ( + bool_t, +) +from pylibcugraph._cugraph_c.resource_handle cimport ( + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.graph_functions cimport ( + cugraph_edgelist_t, + cugraph_decompress_to_edgelist, + cugraph_edgelist_get_sources, + cugraph_edgelist_get_destinations, + cugraph_edgelist_get_edge_weights, + cugraph_edgelist_get_edge_ids, + cugraph_edgelist_get_edge_type_ids, + cugraph_edgelist_get_edge_offsets, + cugraph_edgelist_free, +) + +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + create_cugraph_type_erased_device_array_view_from_py_obj, +) + + +def decompress_to_edgelist(ResourceHandle resource_handle, + _GPUGraph graph, + bool_t do_expensive_check): + """ + Extract the edgelist from a graph. + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph or MGGraph + The input graph. 
+ + do_expensive_check : bool_t + If True, performs more extensive tests on the inputs to ensure + validity, at the expense of increased run time. + + Returns + ------- + A tuple of device arrays (sources, destinations, edge_weights, edge_ids, + edge_type_ids); the last three are None when the result lacks that array. + + Examples + -------- + >>> import pylibcugraph, cupy, numpy + >>> srcs = cupy.asarray([0, 1, 1, 2, 2, 2, 3, 4], dtype=numpy.int32) + >>> dsts = cupy.asarray([1, 3, 4, 0, 1, 3, 5, 5], dtype=numpy.int32) + >>> weights = cupy.asarray( + ... [0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2], dtype=numpy.float32) + >>> resource_handle = pylibcugraph.ResourceHandle() + >>> graph_props = pylibcugraph.GraphProperties( + ... is_symmetric=False, is_multigraph=False) + >>> G = pylibcugraph.SGGraph( + ... resource_handle, graph_props, srcs, dsts, weight_array=weights, + ... store_transposed=False, renumber=False, do_expensive_check=False) + >>> (sources, destinations, edge_weights, _, _) = ( + ... pylibcugraph.decompress_to_edgelist( + ... resource_handle, G, False)) + >>> sources + [0, 1, 1, 2, 2, 2, 3, 4] + >>> destinations + [1, 3, 4, 0, 1, 3, 5, 5] + >>> edge_weights + [0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2] + """ + + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr + cdef cugraph_edgelist_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + error_code = cugraph_decompress_to_edgelist(c_resource_handle_ptr, + c_graph_ptr, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_decompress_to_edgelist") + + # Extract individual device array pointers from result and copy to cupy + # arrays for returning. 
+ + cdef cugraph_type_erased_device_array_view_t* sources_ptr = \ + cugraph_edgelist_get_sources(result_ptr) + cdef cugraph_type_erased_device_array_view_t* destinations_ptr = \ + cugraph_edgelist_get_destinations(result_ptr) + cdef cugraph_type_erased_device_array_view_t* edge_weights_ptr = \ + cugraph_edgelist_get_edge_weights(result_ptr) + + cdef cugraph_type_erased_device_array_view_t* edge_ids_ptr = \ + cugraph_edgelist_get_edge_ids(result_ptr) + cdef cugraph_type_erased_device_array_view_t* edge_type_ids_ptr = \ + cugraph_edgelist_get_edge_type_ids(result_ptr) + + + """ + cdef cugraph_type_erased_device_array_view_t* subgraph_offsets_ptr = \ + cugraph_edgelist_get_edge_offsets(result_ptr) + """ + + # FIXME: Get ownership of the result data instead of performing a copy + # for performance improvement + cupy_edge_weights = None + cupy_edge_ids = None + cupy_edge_type_ids = None + cupy_sources = copy_to_cupy_array( + c_resource_handle_ptr, sources_ptr) + cupy_destinations = copy_to_cupy_array( + c_resource_handle_ptr, destinations_ptr) + if edge_weights_ptr != NULL: + cupy_edge_weights = copy_to_cupy_array( + c_resource_handle_ptr, edge_weights_ptr) + if edge_ids_ptr != NULL: + cupy_edge_ids = copy_to_cupy_array( + c_resource_handle_ptr, edge_ids_ptr) + if edge_type_ids_ptr != NULL: + cupy_edge_type_ids = copy_to_cupy_array( + c_resource_handle_ptr, edge_type_ids_ptr) + + """ + cupy_subgraph_offsets = copy_to_cupy_array( + c_resource_handle_ptr, subgraph_offsets_ptr) + """ + + # Free pointer + cugraph_edgelist_free(result_ptr) + + return (cupy_sources, cupy_destinations, + cupy_edge_weights, cupy_edge_ids, cupy_edge_type_ids)