diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 707c61e4d3e..1976d8ff46f 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -30,7 +30,23 @@ if [[ ${package_name} == "nx-cugraph" ]] || \ [[ ${package_name} == "cugraph-equivariant" ]]; then RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 dist else + case "${RAPIDS_CUDA_VERSION}" in + 12.*) + EXCLUDE_ARGS=( + --exclude "libcublas.so.12" + --exclude "libcublasLt.so.12" + --exclude "libcurand.so.10" + --exclude "libcusolver.so.11" + --exclude "libcusparse.so.12" + --exclude "libnvJitLink.so.12" + ) + ;; + 11.*) + EXCLUDE_ARGS=() + ;; + esac + mkdir -p final_dist - python -m auditwheel repair -w final_dist dist/* + python -m auditwheel repair -w final_dist "${EXCLUDE_ARGS[@]}" dist/* RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist fi diff --git a/ci/build_wheel_cugraph.sh b/ci/build_wheel_cugraph.sh index 6f1b23923ff..20d9bf47e3e 100755 --- a/ci/build_wheel_cugraph.sh +++ b/ci/build_wheel_cugraph.sh @@ -19,8 +19,16 @@ export PIP_CONSTRAINT="${PWD}/constraints.txt" PARALLEL_LEVEL=$(python -c \ "from math import ceil; from multiprocessing import cpu_count; print(ceil(cpu_count()/4))") +case "${RAPIDS_CUDA_VERSION}" in + 12.*) + EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON" + ;; + 11.*) + EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=OFF" + ;; +esac -export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUGRAPH_CPP=OFF;-DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" +export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUGRAPH_CPP=OFF;-DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/${EXTRA_CMAKE_ARGS}" export SKBUILD_BUILD_TOOL_ARGS="-j${PARALLEL_LEVEL};-l${PARALLEL_LEVEL}" ./ci/build_wheel.sh cugraph python/cugraph diff --git a/ci/build_wheel_pylibcugraph.sh b/ci/build_wheel_pylibcugraph.sh index ee33ab4a82d..fa967b0be29 100755 --- a/ci/build_wheel_pylibcugraph.sh +++ b/ci/build_wheel_pylibcugraph.sh @@ -6,7 +6,16 @@ set -euo pipefail PARALLEL_LEVEL=$(python -c \ "from math import ceil; from multiprocessing import cpu_count; print(ceil(cpu_count()/4))") -export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUGRAPH_CPP=OFF;-DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" +case "${RAPIDS_CUDA_VERSION}" in + 12.*) + EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON" + ;; + 11.*) + EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=OFF" + ;; +esac + +export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUGRAPH_CPP=OFF;-DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/${EXTRA_CMAKE_ARGS}" export SKBUILD_BUILD_TOOL_ARGS="-j${PARALLEL_LEVEL};-l${PARALLEL_LEVEL}" ./ci/build_wheel.sh pylibcugraph python/pylibcugraph diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 441627cabce..26b710247f6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -332,6 +332,12 @@ set(CUGRAPH_SOURCES src/sampling/neighbor_sampling_sg_v32_e64.cpp src/sampling/neighbor_sampling_sg_v32_e32.cpp src/sampling/neighbor_sampling_sg_v64_e64.cpp + src/sampling/negative_sampling_sg_v32_e64.cu + src/sampling/negative_sampling_sg_v32_e32.cu + src/sampling/negative_sampling_sg_v64_e64.cu + src/sampling/negative_sampling_mg_v32_e64.cu + src/sampling/negative_sampling_mg_v32_e32.cu + src/sampling/negative_sampling_mg_v64_e64.cu src/sampling/renumber_sampled_edgelist_sg_v64_e64.cu src/sampling/renumber_sampled_edgelist_sg_v32_e32.cu src/sampling/sampling_post_processing_sg_v64_e64.cu @@ -483,6 +489,7 @@ 
set(CUGRAPH_SOURCES src/centrality/betweenness_centrality_mg_v32_e32.cu src/centrality/betweenness_centrality_mg_v32_e64.cu src/tree/legacy/mst.cu + src/from_cugraph_ops/sampling_index.cu src/components/weakly_connected_components_sg_v64_e64.cu src/components/weakly_connected_components_sg_v32_e32.cu src/components/weakly_connected_components_sg_v32_e64.cu @@ -656,6 +663,7 @@ add_library(cugraph_c src/c_api/louvain.cpp src/c_api/triangle_count.cpp src/c_api/neighbor_sampling.cpp + src/c_api/negative_sampling.cpp src/c_api/labeling_result.cpp src/c_api/weakly_connected_components.cpp src/c_api/strongly_connected_components.cpp diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp index 8ba39fa2328..faeb7ad8f83 100644 --- a/cpp/include/cugraph/algorithms.hpp +++ b/cpp/include/cugraph/algorithms.hpp @@ -1684,6 +1684,8 @@ node2vec_random_walks(raft::handle_t const& handle, * list of vertices and sample size per vertex. The output graph consists of the given * vertices with each vertex having at most `sample_size` neighbors from the original graph * + * @deprecated This API will be deprecated. uniform_neighbor_sample can be used instead. + * * @tparam graph_t Type of input graph/view (typically, graph_view_t, non-transposed and * single-gpu). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -1714,6 +1716,8 @@ sample_neighbors_adjacency_list(raft::handle_t const& handle, * list of vertices and sample size per vertex. The output graph consists of the given * vertices with each vertex having at most `sample_size` neighbors from the original graph * + * @deprecated This API will be deprecated. uniform_neighbor_sample can be used instead. + * * @tparam graph_t Type of input graph/view (typically, graph_view_t, non-transposed and * single-gpu). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index cbb52ef3b1e..a2ff3166fa4 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -636,7 +636,7 @@ class graph_view_t edge_srcs, raft::device_span edge_dsts, - bool do_expensive_check = false); + bool do_expensive_check = false) const; rmm::device_uvector compute_multiplicity( raft::handle_t const& handle, @@ -945,7 +945,7 @@ class graph_view_t has_edge(raft::handle_t const& handle, raft::device_span edge_srcs, raft::device_span edge_dsts, - bool do_expensive_check = false); + bool do_expensive_check = false) const; rmm::device_uvector compute_multiplicity(raft::handle_t const& handle, raft::device_span edge_srcs, diff --git a/cpp/include/cugraph/sampling_functions.hpp b/cpp/include/cugraph/sampling_functions.hpp index fec1a07604e..4e5596d06e0 100644 --- a/cpp/include/cugraph/sampling_functions.hpp +++ b/cpp/include/cugraph/sampling_functions.hpp @@ -743,4 +743,61 @@ lookup_endpoints_from_edge_ids_and_types( raft::device_span edge_ids_to_lookup, raft::device_span edge_types_to_lookup); +/** + * @brief Negative Sampling + * + * This function generates negative samples for graph. 
+ *
+ * Negative sampling is done by generating a random graph according to the specified
+ * parameters and optionally removing samples that represent actual edges in the graph
+ *
+ * Sampling occurs by creating a list of source vertex ids from biased sampling
+ * of the source vertex space, and destination vertex ids from biased sampling of the
+ * destination vertex space, and using this as the putative list of edges. We
+ * can then optionally remove duplicates and remove actual edges in the graph to generate
+ * the final list. If necessary we will repeat the process to end with a resulting
+ * edge list of the appropriate size.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam store_transposed Flag indicating whether sources (if false) or destinations (if
+ * true) are major indices
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true)
+ *
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param rng_state RNG state
+ * @param graph_view Graph View object to generate negative samples for
+ * @param src_biases Optional bias for randomly selecting source vertices. If std::nullopt vertices
+ * will be selected uniformly. In multi-GPU environment the biases should be partitioned based
+ * on the vertex partitions.
+ * @param dst_biases Optional bias for randomly selecting destination vertices. If std::nullopt
+ * vertices will be selected uniformly. In multi-GPU environment the biases should be partitioned
+ * based on the vertex partitions.
+ * @param num_samples Number of negative samples to generate
+ * @param remove_duplicates If true, remove duplicate samples
+ * @param remove_existing_edges If true, remove samples that are actually edges in the graph
+ * @param exact_number_of_samples If true, repeat generation until we get the exact number of
+ * negative samples
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ *
+ * @return tuple containing source vertex ids and destination vertex ids for the negative samples
+ */
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>> negative_sampling(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<raft::device_span<weight_t const>> src_biases,
+  std::optional<raft::device_span<weight_t const>> dst_biases,
+  size_t num_samples,
+  bool remove_duplicates,
+  bool remove_existing_edges,
+  bool exact_number_of_samples,
+  bool do_expensive_check);
+
 } // namespace cugraph
diff --git a/cpp/include/cugraph_c/coo.h b/cpp/include/cugraph_c/coo.h
new file mode 100644
index 00000000000..ef746c6ed6a
--- /dev/null
+++ b/cpp/include/cugraph_c/coo.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
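
For reference, a minimal single-GPU usage sketch of the negative_sampling API added above in cpp/include/cugraph/sampling_functions.hpp. The wrapper function and the explicit template arguments (following the <vertex_t, edge_t, weight_t, store_transposed, multi_gpu> parameter list reconstructed above) are illustrative assumptions, not code from this patch:

    // Illustrative sketch: draw negative edges from an existing single-GPU graph view,
    // sampling sources and destinations uniformly and rejecting edges already in the graph.
    #include <cugraph/graph_view.hpp>
    #include <cugraph/sampling_functions.hpp>

    #include <raft/core/handle.hpp>
    #include <raft/random/rng_state.hpp>

    #include <rmm/device_uvector.hpp>

    #include <cstddef>
    #include <optional>
    #include <tuple>

    std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>> draw_negative_edges(
      raft::handle_t const& handle,
      cugraph::graph_view_t<int32_t, int64_t, false, false> const& graph_view,
      size_t num_samples)
    {
      raft::random::RngState rng_state{0};
      return cugraph::negative_sampling<int32_t, int64_t, float, false, false>(
        handle,
        rng_state,
        graph_view,
        std::nullopt,   // src_biases: select sources uniformly
        std::nullopt,   // dst_biases: select destinations uniformly
        num_samples,
        true,           // remove_duplicates
        true,           // remove_existing_edges
        false,          // exact_number_of_samples
        false);         // do_expensive_check
    }
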
+ */ + +#pragma once + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Opaque COO definition + */ +typedef struct { + int32_t align_; +} cugraph_coo_t; + +/** + * @brief Opaque COO list definition + */ +typedef struct { + int32_t align_; +} cugraph_coo_list_t; + +/** + * @brief Get the source vertex ids + * + * @param [in] coo Opaque pointer to COO + * @return type erased array view of source vertex ids + */ +cugraph_type_erased_device_array_view_t* cugraph_coo_get_sources(cugraph_coo_t* coo); + +/** + * @brief Get the destination vertex ids + * + * @param [in] coo Opaque pointer to COO + * @return type erased array view of destination vertex ids + */ +cugraph_type_erased_device_array_view_t* cugraph_coo_get_destinations(cugraph_coo_t* coo); + +/** + * @brief Get the edge weights + * + * @param [in] coo Opaque pointer to COO + * @return type erased array view of edge weights, NULL if no edge weights in COO + */ +cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_weights(cugraph_coo_t* coo); + +/** + * @brief Get the edge id + * + * @param [in] coo Opaque pointer to COO + * @return type erased array view of edge id, NULL if no edge ids in COO + */ +cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_id(cugraph_coo_t* coo); + +/** + * @brief Get the edge type + * + * @param [in] coo Opaque pointer to COO + * @return type erased array view of edge type, NULL if no edge types in COO + */ +cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_type(cugraph_coo_t* coo); + +/** + * @brief Get the number of coo object in the list + * + * @param [in] coo_list Opaque pointer to COO list + * @return number of elements + */ +size_t cugraph_coo_list_size(const cugraph_coo_list_t* coo_list); + +/** + * @brief Get a COO from the list + * + * @param [in] coo_list Opaque pointer to COO list + * @param [in] index Index of desired COO from list + * @return a cugraph_coo_t* object from the list + */ +cugraph_coo_t* cugraph_coo_list_element(cugraph_coo_list_t* coo_list, size_t index); + +/** + * @brief Free coo object + * + * @param [in] coo Opaque pointer to COO + */ +void cugraph_coo_free(cugraph_coo_t* coo); + +/** + * @brief Free coo list + * + * @param [in] coo_list Opaque pointer to list of COO objects + */ +void cugraph_coo_list_free(cugraph_coo_list_t* coo_list); + +#ifdef __cplusplus +} +#endif diff --git a/cpp/include/cugraph_c/graph_generators.h b/cpp/include/cugraph_c/graph_generators.h index 272131d2aab..553be530e95 100644 --- a/cpp/include/cugraph_c/graph_generators.h +++ b/cpp/include/cugraph_c/graph_generators.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -27,91 +28,6 @@ extern "C" { typedef enum { POWER_LAW = 0, UNIFORM } cugraph_generator_distribution_t; -/** - * @brief Opaque COO definition - */ -typedef struct { - int32_t align_; -} cugraph_coo_t; - -/** - * @brief Opaque COO list definition - */ -typedef struct { - int32_t align_; -} cugraph_coo_list_t; - -/** - * @brief Get the source vertex ids - * - * @param [in] coo Opaque pointer to COO - * @return type erased array view of source vertex ids - */ -cugraph_type_erased_device_array_view_t* cugraph_coo_get_sources(cugraph_coo_t* coo); - -/** - * @brief Get the destination vertex ids - * - * @param [in] coo Opaque pointer to COO - * @return type erased array view of destination vertex ids - */ -cugraph_type_erased_device_array_view_t* cugraph_coo_get_destinations(cugraph_coo_t* coo); - -/** - * @brief Get the edge weights 
- * - * @param [in] coo Opaque pointer to COO - * @return type erased array view of edge weights, NULL if no edge weights in COO - */ -cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_weights(cugraph_coo_t* coo); - -/** - * @brief Get the edge id - * - * @param [in] coo Opaque pointer to COO - * @return type erased array view of edge id, NULL if no edge ids in COO - */ -cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_id(cugraph_coo_t* coo); - -/** - * @brief Get the edge type - * - * @param [in] coo Opaque pointer to COO - * @return type erased array view of edge type, NULL if no edge types in COO - */ -cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_type(cugraph_coo_t* coo); - -/** - * @brief Get the number of coo object in the list - * - * @param [in] coo_list Opaque pointer to COO list - * @return number of elements - */ -size_t cugraph_coo_list_size(const cugraph_coo_list_t* coo_list); - -/** - * @brief Get a COO from the list - * - * @param [in] coo_list Opaque pointer to COO list - * @param [in] index Index of desired COO from list - * @return a cugraph_coo_t* object from the list - */ -cugraph_coo_t* cugraph_coo_list_element(cugraph_coo_list_t* coo_list, size_t index); - -/** - * @brief Free coo object - * - * @param [in] coo Opaque pointer to COO - */ -void cugraph_coo_free(cugraph_coo_t* coo); - -/** - * @brief Free coo list - * - * @param [in] coo_list Opaque pointer to list of COO objects - */ -void cugraph_coo_list_free(cugraph_coo_list_t* coo_list); - /** * @brief Generate RMAT edge list * diff --git a/cpp/include/cugraph_c/sampling_algorithms.h b/cpp/include/cugraph_c/sampling_algorithms.h index 1a3d20b9339..bb26e577915 100644 --- a/cpp/include/cugraph_c/sampling_algorithms.h +++ b/cpp/include/cugraph_c/sampling_algorithms.h @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -674,6 +675,57 @@ cugraph_error_code_t cugraph_select_random_vertices(const cugraph_resource_handl cugraph_type_erased_device_array_t** vertices, cugraph_error_t** error); +/** + * @ingroup samplingC + * @brief Perform negative sampling + * + * Negative sampling generates a COO structure defining edges according to the specified parameters + * + * @param [in] handle Handle for accessing resources + * @param [in,out] rng_state State of the random number generator, updated with each + * call + * @param [in] graph Pointer to graph + * @param [in] vertices Vertex ids for the source biases. If @p src_bias and + * @p dst_bias are not specified this is ignored. If + * @p vertices is specified then vertices[i] is the vertex + * id of src_biases[i] and dst_biases[i]. If @p vertices + * is not specified then i is the vertex id if src_biases[i] + * and dst_biases[i] + * @param [in] src_biases Bias for selecting source vertices. If NULL, do uniform + * sampling, if provided probability of vertex i will be + * src_bias[i] / (sum of all source biases) + * @param [in] dst_biases Bias for selecting destination vertices. If NULL, do + * uniform sampling, if provided probability of vertex i + * will be dst_bias[i] / (sum of all destination biases) + * @param [in] num_samples Number of negative samples to generate + * @param [in] remove_duplicates If true, remove duplicates from sampled edges + * @param [in] remove_existing_edges If true, remove sampled edges that actually exist in + * the graph + * @param [in] exact_number_of_samples If true, result should contain exactly @p num_samples. 
If + * false the code will generate @p num_samples and then do + * any filtering as specified + * @param [in] do_expensive_check A flag to run expensive checks for input arguments (if + * set to true) + * @param [out] result Opaque pointer to generated coo list + * @param [out] error Pointer to an error object storing details of any error. + * Will be populated if error code is not CUGRAPH_SUCCESS + * @return error code + */ +cugraph_error_code_t cugraph_negative_sampling( + const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + const cugraph_type_erased_device_array_view_t* src_biases, + const cugraph_type_erased_device_array_view_t* dst_biases, + size_t num_samples, + bool_t remove_duplicates, + bool_t remove_existing_edges, + bool_t exact_number_of_samples, + bool_t do_expensive_check, + cugraph_coo_t** result, + cugraph_error_t** error); + #ifdef __cplusplus } #endif diff --git a/cpp/src/c_api/coo.hpp b/cpp/src/c_api/coo.hpp new file mode 100644 index 00000000000..a83a3af375a --- /dev/null +++ b/cpp/src/c_api/coo.hpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c_api/array.hpp" + +#include + +namespace cugraph { +namespace c_api { + +struct cugraph_coo_t { + std::unique_ptr src_{}; + std::unique_ptr dst_{}; + std::unique_ptr wgt_{}; + std::unique_ptr id_{}; + std::unique_ptr type_{}; +}; + +struct cugraph_coo_list_t { + std::vector> list_; +}; + +} // namespace c_api +} // namespace cugraph diff --git a/cpp/src/c_api/graph_generators.cpp b/cpp/src/c_api/graph_generators.cpp index ef478e57098..7601f1508f9 100644 --- a/cpp/src/c_api/graph_generators.cpp +++ b/cpp/src/c_api/graph_generators.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ #include "c_api/array.hpp" +#include "c_api/coo.hpp" #include "c_api/error.hpp" #include "c_api/random.hpp" #include "c_api/resource_handle.hpp" @@ -26,24 +27,6 @@ #include -namespace cugraph { -namespace c_api { - -struct cugraph_coo_t { - std::unique_ptr src_{}; - std::unique_ptr dst_{}; - std::unique_ptr wgt_{}; - std::unique_ptr id_{}; - std::unique_ptr type_{}; -}; - -struct cugraph_coo_list_t { - std::vector> list_; -}; - -} // namespace c_api -} // namespace cugraph - namespace { template diff --git a/cpp/src/c_api/negative_sampling.cpp b/cpp/src/c_api/negative_sampling.cpp new file mode 100644 index 00000000000..54f465d67b4 --- /dev/null +++ b/cpp/src/c_api/negative_sampling.cpp @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
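
For reference, a sketch of how the cugraph_negative_sampling entry point declared above in cugraph_c/sampling_algorithms.h is expected to be used together with the COO accessors from the new cugraph_c/coo.h. This is not part of the diff: the resource handle and graph are assumed to have been created through the existing C API, the RNG-state helpers are assumed to come from the existing cugraph_c/random.h, and error handling is abbreviated:

    // Illustrative sketch: generate negative samples through the C API and read back the COO.
    #include <cugraph_c/coo.h>
    #include <cugraph_c/random.h>
    #include <cugraph_c/sampling_algorithms.h>

    #include <cstddef>

    int draw_negative_edges_c(const cugraph_resource_handle_t* handle,
                              cugraph_graph_t* graph,
                              size_t num_samples)
    {
      cugraph_rng_state_t* rng_state = nullptr;
      cugraph_coo_t* coo             = nullptr;
      cugraph_error_t* error         = nullptr;

      if (cugraph_rng_state_create(handle, 0 /* seed */, &rng_state, &error) != CUGRAPH_SUCCESS) {
        return -1;
      }

      // NULL biases request uniform sampling of sources and destinations.
      cugraph_error_code_t status = cugraph_negative_sampling(handle,
                                                              rng_state,
                                                              graph,
                                                              nullptr,  // vertices
                                                              nullptr,  // src_biases
                                                              nullptr,  // dst_biases
                                                              num_samples,
                                                              TRUE,     // remove_duplicates
                                                              TRUE,     // remove_existing_edges
                                                              FALSE,    // exact_number_of_samples
                                                              FALSE,    // do_expensive_check
                                                              &coo,
                                                              &error);

      if (status == CUGRAPH_SUCCESS) {
        cugraph_type_erased_device_array_view_t* srcs = cugraph_coo_get_sources(coo);
        cugraph_type_erased_device_array_view_t* dsts = cugraph_coo_get_destinations(coo);
        size_t n_edges = cugraph_type_erased_device_array_view_size(srcs);
        // ... hand srcs/dsts (device views of length n_edges) to downstream code ...
        cugraph_coo_free(coo);
      }
      cugraph_rng_state_free(rng_state);
      return (status == CUGRAPH_SUCCESS) ? 0 : -1;
    }
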
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c_api/abstract_functor.hpp" +#include "c_api/coo.hpp" +#include "c_api/graph.hpp" +#include "c_api/random.hpp" +#include "c_api/resource_handle.hpp" +#include "c_api/utils.hpp" + +#include + +#include +#include +#include +#include + +#include + +namespace { + +struct negative_sampling_functor : public cugraph::c_api::abstract_functor { + raft::handle_t const& handle_; + cugraph::c_api::cugraph_rng_state_t* rng_state_{nullptr}; + cugraph::c_api::cugraph_graph_t* graph_{nullptr}; + cugraph::c_api::cugraph_type_erased_device_array_view_t const* vertices_{nullptr}; + cugraph::c_api::cugraph_type_erased_device_array_view_t const* src_biases_{nullptr}; + cugraph::c_api::cugraph_type_erased_device_array_view_t const* dst_biases_{nullptr}; + size_t num_samples_; + bool remove_duplicates_{false}; + bool remove_existing_edges_{false}; + bool exact_number_of_samples_{false}; + bool do_expensive_check_{false}; + cugraph::c_api::cugraph_coo_t* result_{nullptr}; + + negative_sampling_functor(const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + const cugraph_type_erased_device_array_view_t* src_biases, + const cugraph_type_erased_device_array_view_t* dst_biases, + size_t num_samples, + bool_t remove_duplicates, + bool_t remove_existing_edges, + bool_t exact_number_of_samples, + bool_t do_expensive_check) + : abstract_functor(), + handle_(*reinterpret_cast(handle)->handle_), + rng_state_(reinterpret_cast(rng_state)), + graph_(reinterpret_cast(graph)), + vertices_( + reinterpret_cast(vertices)), + src_biases_(reinterpret_cast( + src_biases)), + dst_biases_(reinterpret_cast( + dst_biases)), + num_samples_(num_samples), + remove_duplicates_(remove_duplicates), + remove_existing_edges_(remove_existing_edges), + exact_number_of_samples_(exact_number_of_samples), + do_expensive_check_(do_expensive_check) + { + } + + template + void operator()() + { + // FIXME: Think about how to handle SG vice MG + if constexpr (!cugraph::is_candidate::value) { + unsupported(); + } else { + // negative_sampling expects store_transposed == false + if constexpr (store_transposed) { + error_code_ = cugraph::c_api:: + transpose_storage( + handle_, graph_, error_.get()); + if (error_code_ != CUGRAPH_SUCCESS) return; + } + + auto graph = + reinterpret_cast*>(graph_->graph_); + + auto graph_view = graph->view(); + + auto number_map = reinterpret_cast*>(graph_->number_map_); + + rmm::device_uvector vertices(0, handle_.get_stream()); + rmm::device_uvector src_biases(0, handle_.get_stream()); + rmm::device_uvector dst_biases(0, handle_.get_stream()); + + if (src_biases_ != nullptr) { + vertices.resize(vertices_->size_, handle_.get_stream()); + src_biases.resize(src_biases_->size_, handle_.get_stream()); + + raft::copy( + vertices.data(), vertices_->as_type(), vertices.size(), handle_.get_stream()); + raft::copy(src_biases.data(), + src_biases_->as_type(), + src_biases.size(), + handle_.get_stream()); + + src_biases = cugraph::detail:: + collect_local_vertex_values_from_ext_vertex_value_pairs( + 
handle_, + std::move(vertices), + std::move(src_biases), + *number_map, + graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_last(), + weight_t{0}, + do_expensive_check_); + } + + if (dst_biases_ != nullptr) { + vertices.resize(vertices_->size_, handle_.get_stream()); + dst_biases.resize(dst_biases_->size_, handle_.get_stream()); + + raft::copy( + vertices.data(), vertices_->as_type(), vertices.size(), handle_.get_stream()); + raft::copy(dst_biases.data(), + dst_biases_->as_type(), + dst_biases.size(), + handle_.get_stream()); + + dst_biases = cugraph::detail:: + collect_local_vertex_values_from_ext_vertex_value_pairs( + handle_, + std::move(vertices), + std::move(dst_biases), + *number_map, + graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_last(), + weight_t{0}, + do_expensive_check_); + } + + auto&& [src, dst] = cugraph::negative_sampling( + handle_, + rng_state_->rng_state_, + graph_view, + (src_biases_ != nullptr) ? std::make_optional(raft::device_span{ + src_biases.data(), src_biases.size()}) + : std::nullopt, + (dst_biases_ != nullptr) ? std::make_optional(raft::device_span{ + dst_biases.data(), dst_biases.size()}) + : std::nullopt, + num_samples_, + remove_duplicates_, + remove_existing_edges_, + exact_number_of_samples_, + do_expensive_check_); + + std::vector vertex_partition_lasts = graph_view.vertex_partition_range_lasts(); + + cugraph::unrenumber_int_vertices(handle_, + src.data(), + src.size(), + number_map->data(), + vertex_partition_lasts, + do_expensive_check_); + + cugraph::unrenumber_int_vertices(handle_, + dst.data(), + dst.size(), + number_map->data(), + vertex_partition_lasts, + do_expensive_check_); + + result_ = new cugraph::c_api::cugraph_coo_t{ + std::make_unique(src, + graph_->vertex_type_), + std::make_unique(dst, + graph_->vertex_type_), + nullptr, + nullptr, + nullptr}; + } + } +}; + +} // namespace + +cugraph_error_code_t cugraph_negative_sampling( + const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + const cugraph_type_erased_device_array_view_t* src_biases, + const cugraph_type_erased_device_array_view_t* dst_biases, + size_t num_samples, + bool_t remove_duplicates, + bool_t remove_existing_edges, + bool_t exact_number_of_samples, + bool_t do_expensive_check, + cugraph_coo_t** result, + cugraph_error_t** error) +{ + negative_sampling_functor functor{handle, + rng_state, + graph, + vertices, + src_biases, + dst_biases, + num_samples, + remove_duplicates, + remove_existing_edges, + exact_number_of_samples, + do_expensive_check}; + return cugraph::c_api::run_algorithm(graph, functor, result, error); +} diff --git a/cpp/src/community/egonet_impl.cuh b/cpp/src/community/egonet_impl.cuh index 8b942be5b6a..c7945831ba8 100644 --- a/cpp/src/community/egonet_impl.cuh +++ b/cpp/src/community/egonet_impl.cuh @@ -17,8 +17,6 @@ // #define TIMING -#include "utilities/graph_utils.cuh" - #include #include #include diff --git a/cpp/src/components/legacy/connectivity.cu b/cpp/src/components/legacy/connectivity.cu index ecaaab173db..4d0198fdff6 100644 --- a/cpp/src/components/legacy/connectivity.cu +++ b/cpp/src/components/legacy/connectivity.cu @@ -15,7 +15,6 @@ */ #include "scc_matrix.cuh" -#include "utilities/graph_utils.cuh" #include "weak_cc.cuh" #include diff --git a/cpp/src/detail/utility_wrappers_32.cu b/cpp/src/detail/utility_wrappers_32.cu index 6ab5ae375ca..72dee4a19a5 100644 
--- a/cpp/src/detail/utility_wrappers_32.cu +++ b/cpp/src/detail/utility_wrappers_32.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "detail/utility_wrappers.cuh" +#include "detail/utility_wrappers_impl.cuh" #include #include diff --git a/cpp/src/detail/utility_wrappers_64.cu b/cpp/src/detail/utility_wrappers_64.cu index a12bc3e952d..e7254d97c4d 100644 --- a/cpp/src/detail/utility_wrappers_64.cu +++ b/cpp/src/detail/utility_wrappers_64.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "detail/utility_wrappers.cuh" +#include "detail/utility_wrappers_impl.cuh" #include #include diff --git a/cpp/src/detail/utility_wrappers.cuh b/cpp/src/detail/utility_wrappers_impl.cuh similarity index 100% rename from cpp/src/detail/utility_wrappers.cuh rename to cpp/src/detail/utility_wrappers_impl.cuh diff --git a/cpp/src/from_cugraph_ops/algo_R.cuh b/cpp/src/from_cugraph_ops/algo_R.cuh new file mode 100644 index 00000000000..031a7d2ceb9 --- /dev/null +++ b/cpp/src/from_cugraph_ops/algo_R.cuh @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "device.cuh" + +#include + +#include +#include +#include + +#include + +namespace cugraph::ops::graph { + +// single warp-separated field of type IdxT +template +using smem_algo_r_t = utils::smem_unit_simple_t<1, IdxT>; + +template +__device__ __forceinline__ void warp_algo_r_index(IdxT* smem, + IdxT pop_size, + IdxT idx_offset, + int sample_size, + raft::random::DeviceState& rng_state) +{ + auto lane = utils::lane_id(); + // first 'sample_size' are just copied + CUGRAPH_OPS_UNROLL + for (int i = lane; i < sample_size; i += utils::WARP_SIZE) { + smem[i] = idx_offset + i; + } + auto sample_size_idxt = IdxT{sample_size}; + if (sample_size_idxt >= pop_size) return; + + // we must synchronize here since we have just written to smem + utils::warp_sync(); + // TODO(mjoux): when we support more warps per node enable this + //__syncthreads(); + + auto idx_end = idx_offset + pop_size; + auto n = idx_offset + sample_size_idxt; + auto flat_id = uint64_t{threadIdx.x + blockIdx.x * blockDim.x}; + GenT gen(rng_state, flat_id); + CUGRAPH_OPS_UNROLL + for (auto nidx = n + IdxT{lane}; nidx < idx_end; nidx += IdxT{utils::WARP_SIZE}) { + // nidx - idx_offset inclusive (necessary for correctness of algo R) + auto end = nidx - idx_offset + 1; + raft::random::UniformIntDistParams int_params{}; + int_params.start = IdxT{0}; + int_params.end = IdxT{end}; + int_params.diff = static_cast(end); + IdxT idx; + raft::random::custom_next(gen, &idx, int_params, 0, 0 /* idx / stride unused */); + if (idx < sample_size_idxt) { + // using atomic max instead of exch here because it leads to the same + // output as the sequential algorithm (DGL does this, too) + // Additionally, we use the index instead of the neighbor ID here + // since this allows copying over other node/edge-related data + // (useful for heterogeneous graphs for example) + utils::atomic_max(smem + idx, nidx); + } + } + // must synchronize to make smem valid + utils::warp_sync(); + // TODO(mjoux): when we support more warps per node enable this + //__syncthreads(); +} + +template +__device__ __forceinline__ void warp_algo_r(IdxT* smem, + IdxT row_id, + const IdxT* nodes, + const IdxT* fg_offsets, + int sample_size, + IdxT& node_id, + IdxT& node_start, + 
IdxT& node_end, + raft::random::DeviceState& rng_state) +{ + auto lane = utils::lane_id(); + if (nodes == nullptr) { + node_id = row_id; + if (lane == 0) + node_start = fg_offsets[node_id]; + else if (lane == 1) + node_end = fg_offsets[node_id + 1]; + node_start = utils::shfl(node_start, 0); + node_end = utils::shfl(node_end, 1); + } else { + if (lane == 0) { + node_id = nodes[row_id]; + node_start = fg_offsets[node_id]; + node_end = fg_offsets[node_id + 1]; + } + node_id = utils::shfl(node_id, 0); + node_start = utils::shfl(node_start, 0); + node_end = utils::shfl(node_end, 0); + } + auto pop_size = node_end - node_start; + warp_algo_r_index(smem, pop_size, node_start, sample_size, rng_state); +} + +// TODO(mjoux): support configuring n_warps_per_node in template +template +CUGRAPH_OPS_KERNEL void algo_r_kernel(raft::random::DeviceState rng_state, + IdxT* neighbors, + IdxT* counts, + // edge_types / node_types should be non-const + // probably detected if `!IS_HG` + // NOLINTNEXTLINE(readability-non-const-parameter) + int32_t* edge_types, + // NOLINTNEXTLINE(readability-non-const-parameter) + int32_t* node_types, + const IdxT* offsets, + const IdxT* indices, + const int32_t* g_edge_types, + const int32_t* g_node_types, + const IdxT* nodes, + IdxT n_dst_nodes, + int sample_size) +{ + auto lane = utils::lane_id(); + auto warp = utils::warp_id(); // 1D block with X dim + auto row_id = warp + static_cast(blockIdx.x) * IdxT{N_WARPS}; + if (row_id >= n_dst_nodes) { return; } + IdxT* s_idx; + smem_algo_r_t smem{}; + int32_t smem_sizes[] = {sample_size}; + smem.set_ptrs(warp, N_WARPS, smem_sizes, s_idx); + IdxT node_id, node_start, node_end; + warp_algo_r( + s_idx, row_id, nodes, offsets, sample_size, node_id, node_start, node_end, rng_state); + + IdxT count = 0; + for (int i = lane; i < sample_size; i += utils::WARP_SIZE) { + auto nidx = s_idx[i]; + // checking for node_end here because sample_size may be larger than + // the total number of neighbors of the node + auto val = nidx < node_end ? indices[nidx] : cugraph::invalid_idx::value; + // TODO(mjoux) it's possible that we break the ELLPACK format here since + // if we set val to invalid, we should add it to end of list, rather + // than simply at index "i". 
This is ignored for now since the case + // where SAMPLE_SELF := false is rare and unconventional + if (!SAMPLE_SELF && val == node_id) val = cugraph::invalid_idx::value; + auto local_id = row_id * IdxT{sample_size} + i; + neighbors[local_id] = val; + if (val != cugraph::invalid_idx::value) { + ++count; + if (IS_HG) edge_types[local_id] = g_edge_types[nidx]; + } + } + if (IS_HG && lane == 0) node_types[row_id] = g_node_types[node_id]; + if (counts != nullptr) { + count = utils::warp_reduce(count); + if (lane == 0) { counts[row_id] = count; } + } +} + +template +void algo_r_impl(IdxT* neighbors, + IdxT* counts, + int32_t* edge_types, + int32_t* node_types, + raft::random::RngState& rng, + const IdxT* offsets, + const IdxT* indices, + const int32_t* g_edge_types, + const int32_t* g_node_types, + const IdxT* nodes, + IdxT n_dst_nodes, + IdxT g_n_dst_nodes, + IdxT sample_size, + IdxT max_val, + cudaStream_t stream) +{ + if (nodes == nullptr) { n_dst_nodes = g_n_dst_nodes; } + ASSERT(n_dst_nodes <= g_n_dst_nodes, + "Algo R: expected n_dst_nodes <= graph.n_dst_nodes (%ld > %ld)", + long(n_dst_nodes), + long(g_n_dst_nodes)); + ASSERT( + static_cast(sample_size) + 2 < static_cast(std::numeric_limits::max()), + "Expected sample size [+2] to be lower than INT_MAX"); + static constexpr int TPB = 512; + static constexpr int N_WARPS = TPB / utils::WARP_SIZE; + auto n_blks = utils::ceil_div(n_dst_nodes, N_WARPS); + int sample_size_i = static_cast(sample_size); + int32_t smem_sizes[] = {sample_size_i}; + size_t smem_size = smem_algo_r_t::get_size(N_WARPS, smem_sizes); + if (static_cast(max_val) < std::numeric_limits::max()) { + // we'll use the 32-bit based method for generating random integers + // as we most likely do not need less bias + RAFT_CALL_RNG_FUNC( + rng, + (algo_r_kernel<<>>), + neighbors, + counts, + edge_types, + node_types, + offsets, + indices, + g_edge_types, + g_node_types, + nodes, + n_dst_nodes, + sample_size_i); + } else { + RAFT_CALL_RNG_FUNC( + rng, + (algo_r_kernel<<>>), + neighbors, + counts, + edge_types, + node_types, + offsets, + indices, + g_edge_types, + g_node_types, + nodes, + n_dst_nodes, + sample_size_i); + } + // update the rng state (this is a pessimistic update as it is difficult to + // compute the number of RNG calls done per thread!) + auto thread_rs = utils::ceil_div( + std::max(IdxT{0}, std::min(max_val, g_n_dst_nodes) - sample_size), utils::WARP_SIZE); + rng.advance(static_cast(n_blks * TPB), thread_rs); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +} // namespace cugraph::ops::graph diff --git a/cpp/src/from_cugraph_ops/device.cuh b/cpp/src/from_cugraph_ops/device.cuh new file mode 100644 index 00000000000..f7d37c62f35 --- /dev/null +++ b/cpp/src/from_cugraph_ops/device.cuh @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "device_atomics.cuh" +#include "device_core.hpp" +#include "device_dim.cuh" +#include "device_smem_helper.cuh" +#include "device_warp_collectives.cuh" +#include "macros.hpp" diff --git a/cpp/src/from_cugraph_ops/device_atomics.cuh b/cpp/src/from_cugraph_ops/device_atomics.cuh new file mode 100644 index 00000000000..b8be7614284 --- /dev/null +++ b/cpp/src/from_cugraph_ops/device_atomics.cuh @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. 
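
The algo_r_kernel above parallelizes reservoir sampling (Algorithm R) across a warp, and the atomic-max update is there so the result matches the sequential algorithm. For reference, a sequential, index-based sketch of that algorithm; this is illustrative only and uses a standard-library RNG rather than the RAFT generators used in the kernel:

    #include <algorithm>
    #include <cstdint>
    #include <random>
    #include <vector>

    // Sequential Algorithm R over the index range [idx_offset, idx_offset + pop_size):
    // keeps `sample_size` slots; stores *indices* (not neighbor ids) so that other
    // per-edge data can be gathered later, as in the kernel above.
    std::vector<int64_t> algo_r_reference(int64_t idx_offset,
                                          int64_t pop_size,
                                          int sample_size,
                                          std::mt19937_64& gen)
    {
      std::vector<int64_t> smem(std::min<int64_t>(sample_size, pop_size));
      for (int64_t i = 0; i < static_cast<int64_t>(smem.size()); ++i) {
        smem[i] = idx_offset + i;  // first `sample_size` indices are copied verbatim
      }
      for (int64_t n = sample_size; n < pop_size; ++n) {
        // draw uniformly from [0, n] inclusive, mirroring `end = nidx - idx_offset + 1`
        std::uniform_int_distribution<int64_t> dist(0, n);
        auto idx = dist(gen);
        if (idx < sample_size) { smem[idx] = idx_offset + n; }
      }
      return smem;
    }
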
+ * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include +#include + +#include + +namespace cugraph::ops::utils { + +/** + * @defgroup AtomicMax Device atomic max operation + * + * @{ + */ +template +__device__ inline DataT atomic_max(DataT* address, DataT val) +{ + return atomicMax(address, val); +} +template <> +__device__ inline float atomic_max(float* address, float val) +{ + using u32_t = unsigned int; + auto* address_as_u32 = reinterpret_cast(address); + u32_t old = *address_as_u32, assumed; + do { + assumed = old; + old = atomicCAS(address_as_u32, assumed, __float_as_uint(max(val, __uint_as_float(assumed)))); + } while (assumed != old); + return __uint_as_float(old); +} +template <> +__device__ inline double atomic_max(double* address, double val) +{ + using u64_t = unsigned long long; // NOLINT(google-runtime-int) + auto* address_as_ull = reinterpret_cast(address); + u64_t old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS( + address_as_ull, assumed, __double_as_longlong(max(val, __longlong_as_double(assumed)))); + } while (assumed != old); + return __longlong_as_double(old); +} +template <> +__device__ inline int64_t atomic_max(int64_t* address, int64_t val) +{ + using u64_t = unsigned long long; // NOLINT(google-runtime-int) + auto* val_as_u64 = reinterpret_cast(&val); + auto* address_as_u64 = reinterpret_cast(address); + auto ret = atomicMax(address_as_u64, *val_as_u64); + return *reinterpret_cast(&ret); +} +template <> +__device__ inline uint64_t atomic_max(uint64_t* address, uint64_t val) +{ + using u64_t = unsigned long long; // NOLINT(google-runtime-int) + auto* val_as_u64 = reinterpret_cast(&val); + auto* address_as_u64 = reinterpret_cast(address); + auto ret = atomicMax(address_as_u64, *val_as_u64); + return *reinterpret_cast(&ret); +} +/** @} */ + +} // namespace cugraph::ops::utils diff --git a/cpp/src/from_cugraph_ops/device_core.hpp b/cpp/src/from_cugraph_ops/device_core.hpp new file mode 100644 index 00000000000..b548d2d4d1f --- /dev/null +++ b/cpp/src/from_cugraph_ops/device_core.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "macros.hpp" + +namespace cugraph::ops::utils { + +/** number of threads per warp */ +static constexpr int WARP_SIZE = 32; + +/** minimum CUDA version required for warp shfl sync functions */ +static constexpr int CUDA_VER_WARP_SHFL = 9000; + +/** + * @brief Provide a ceiling division operation ie. ceil(a / b) + * + * @tparam IntT supposed to be only integers for now! + * + * @param[in] a dividend + * @param[in] b divisor + */ +template +constexpr CUGRAPH_OPS_HD IntT ceil_div(IntT a, IntT b) +{ + return (a + b - 1) / b; +} + +/** + * @brief Provide an alignment function ie. ceil(a / b) * b + * + * @tparam IntT supposed to be only integers for now! 
+ * + * @param[in] a dividend + * @param[in] b divisor + */ +template +constexpr CUGRAPH_OPS_HD IntT align_to(IntT a, IntT b) +{ + return ceil_div(a, b) * b; +} + +} // namespace cugraph::ops::utils diff --git a/cpp/src/from_cugraph_ops/device_dim.cuh b/cpp/src/from_cugraph_ops/device_dim.cuh new file mode 100644 index 00000000000..275d0edd485 --- /dev/null +++ b/cpp/src/from_cugraph_ops/device_dim.cuh @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "device_core.hpp" + +namespace cugraph::ops::utils { + +/** get the lane id of the current thread */ +__device__ __forceinline__ int lane_id() +{ + int id; + asm("mov.s32 %0, %%laneid;" : "=r"(id)); + return id; +} + +/** + * get the flat id of the current thread (within block) + * template parameters allow to control which CTA dimensions are used + */ +template +__device__ __forceinline__ int flat_id() +{ + if (!USE_X && !USE_Y && !USE_Z) + return 0; // weird case, but if we get here, we should have 1 thread + if (!USE_X && !USE_Y && USE_Z) return threadIdx.z; + if (!USE_X && USE_Y && !USE_Z) return threadIdx.y; + if (!USE_X && USE_Y && USE_Z) return threadIdx.y + threadIdx.z * blockDim.y; + if (USE_X && !USE_Y && !USE_Z) return threadIdx.x; + if (USE_X && !USE_Y && USE_Z) return threadIdx.x + threadIdx.z * blockDim.x; + if (USE_X && USE_Y && !USE_Z) return threadIdx.x + threadIdx.y * blockDim.x; + // USE_X && USE_Y && USE_Z + return threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; +} + +/** + * get the number of warps of the current block + * template parameters allow to control which CTA dimensions are used + */ +template +__device__ __forceinline__ int num_warps() +{ + if (!USE_X && !USE_Y && !USE_Z) + return 1; // weird case, but if we get here, we should have 1 thread + if (!USE_X && !USE_Y && USE_Z) return ceil_div(blockDim.z, WARP_SIZE); + if (!USE_X && USE_Y && !USE_Z) return ceil_div(blockDim.y, WARP_SIZE); + if (!USE_X && USE_Y && USE_Z) return ceil_div(blockDim.y * blockDim.z, WARP_SIZE); + if (USE_X && !USE_Y && !USE_Z) return ceil_div(blockDim.x, WARP_SIZE); + if (USE_X && !USE_Y && USE_Z) return ceil_div(blockDim.x * blockDim.z, WARP_SIZE); + if (USE_X && USE_Y && !USE_Z) return ceil_div(blockDim.x * blockDim.y, WARP_SIZE); + // USE_X && USE_Y && USE_Z + return ceil_div(blockDim.x * blockDim.y * blockDim.z, WARP_SIZE); +} + +/** + * get the warp id of the current thread + * template parameters allow to control which CTA dimensions are used + * @note: this only makes sense if the first used dimension of the CTA size + * is a multiple of WARP_SIZE. 
If this is not the case, use + * `flat_id<...>() / WARP_SIZE` to get the warp id of the current thread + */ +template +__device__ __forceinline__ int warp_id() +{ + if (!USE_X && !USE_Y && !USE_Z) + return 0; // weird case, but if we get here, we should have 1 thread + if (!USE_X && !USE_Y && USE_Z) return threadIdx.z / WARP_SIZE; + if (!USE_X && USE_Y && !USE_Z) return threadIdx.y / WARP_SIZE; + if (!USE_X && USE_Y && USE_Z) + return threadIdx.y / WARP_SIZE + threadIdx.z * num_warps(); + if (USE_X && !USE_Y && !USE_Z) return threadIdx.x / WARP_SIZE; + if (USE_X && !USE_Y && USE_Z) + return threadIdx.x / WARP_SIZE + threadIdx.z * num_warps(); + if (USE_X && USE_Y && !USE_Z) + return threadIdx.x / WARP_SIZE + threadIdx.y * num_warps(); + // USE_X && USE_Y && USE_Z + return threadIdx.x / WARP_SIZE + threadIdx.y * num_warps() + + threadIdx.z * blockDim.y * num_warps(); +} + +/** + * get the block dimension of the current executing block + * template parameters allow to control which CTA dimensions are used + */ +template +__device__ __forceinline__ int block_dim() +{ + if (!USE_X && !USE_Y && !USE_Z) + return 1; // weird case, but if we get here, we should have 1 thread + if (!USE_X && !USE_Y && USE_Z) return blockDim.z; + if (!USE_X && USE_Y && !USE_Z) return blockDim.y; + if (!USE_X && USE_Y && USE_Z) return blockDim.y * blockDim.z; + if (USE_X && !USE_Y && !USE_Z) return blockDim.x; + if (USE_X && !USE_Y && USE_Z) return blockDim.x * blockDim.z; + if (USE_X && USE_Y && !USE_Z) return blockDim.x * blockDim.y; + // USE_X && USE_Y && USE_Z + return blockDim.x * blockDim.y * blockDim.z; +} + +/** + * get the flat id of the current thread (within device/grid) + * template parameters allow to control which grid and block/CTA dimensions are used + */ +template +__device__ __forceinline__ int flat_grid_id() +{ + auto b_id = flat_id(); + auto b_dim = block_dim(); + if (!G_USE_X && !G_USE_Y && !G_USE_Z) + return 0; // weird case, but if we get here, we should have 1 thread + if (!G_USE_X && !G_USE_Y && G_USE_Z) return blockIdx.z * b_dim + b_id; + if (!G_USE_X && G_USE_Y && !G_USE_Z) return blockIdx.y * b_dim + b_id; + if (!G_USE_X && G_USE_Y && G_USE_Z) return blockIdx.y * b_dim + blockIdx.z * blockDim.z + b_id; + if (G_USE_X && !G_USE_Y && !G_USE_Z) return blockIdx.x * b_dim + b_id; + if (G_USE_X && !G_USE_Y && G_USE_Z) return blockIdx.x * b_dim + blockIdx.z * blockDim.z + b_id; + if (G_USE_X && G_USE_Y && !G_USE_Z) return blockIdx.x * b_dim + blockIdx.y * blockDim.y + b_id; + // G_USE_X && G_USE_Y && G_USE_Z + return blockIdx.x * b_dim + blockIdx.y * blockDim.y * blockDim.z + blockIdx.z * blockDim.z + b_id; +} + +} // namespace cugraph::ops::utils diff --git a/cpp/src/from_cugraph_ops/device_smem_helper.cuh b/cpp/src/from_cugraph_ops/device_smem_helper.cuh new file mode 100644 index 00000000000..f1b5be071d9 --- /dev/null +++ b/cpp/src/from_cugraph_ops/device_smem_helper.cuh @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "device_core.hpp" + +#include +#include +#include + +namespace cugraph::ops::utils { + +// The following struct must be used to transmit the size and alignment of +// a field to the shared memory helpers below. +// By default, the alignment is just like the alignment of the original data type. 
+template +struct field_type { + using data_t = DataT; + static constexpr int32_t BYTES = static_cast(sizeof(DataT)); + static constexpr int32_t ALIGNMENT = ALIGN > 0 ? ALIGN : alignof(DataT); +}; + +// Imagine we have 2 fields of data in shared memory, one for ints, one for doubles. +// The intended usage of the following class in simple cases is as follows: +// 1. specify the type somewhere for both host and kernel code: +// using special_smem_name_t = smem_helper< 0, 0, field_type, field_type >; +// /* can be simplified to the following: */ +// using special_smem_name_t = smem_simple_t< int, double >; +// 2. in host code, get the size of shared memory: +// int32_t smem_sizes[] = {n_ints, n_doubles}; +// /* note: sizes are always in number of elements, not bytes */ +// /* sizes always have type `int32_t` */ +// auto size = special_smem_name_t::get_size(sizes); +// 3. in device code, call the empty constructor: +// special_smem_name_t helper {}; +// int* s_ints; +// double* s_doubles; +// int32_t smem_sizes[] = {n_ints, n_doubles}; +// helper.set_ptrs(sizes, s_ints, s_doubles); +// +// For more complicated use cases, it is often useful to create a struct overloading +// operator[] and passing that to the `get_size` or `set_ptrs` helpers. +// The struct can also be used to directly pass the size information from +// host code (launch) to the kernel, avoiding duplication of calculating sizes. +// Be aware that this overload must have a `__host__ __device__` signature. +// Here is an example struct for the above use case: +// struct sizes_t { +// int32_t n_ints, n_doubles; +// __host__ __device__ sizes_t() = delete; +// __host__ __device__ sizes_t(int32_t _n_ints, int32_t _n_doubles) : +// n_ints(_n_ints), n_doubles(_n_doubles) {} +// +// /* you may also just return int32_t here instead of const int32_t& */ +// __host__ __device__ const int32_t& operator[](int idx) const +// { +// return idx == 0 ? n_ints : n_doubles; +// } +// }; +// +// The ALIGN_INIT template parameter is important for correctness: +// By default (ALIGN_INIT=0), we assume that all alignments are powers of 2, +// and we set ALIGN_INIT to the max alignment of the fields. If you want more +// control, you can set it yourself, but we always assume that it is a multiple +// of all alignment values of the fields. +// +// The N_UNIT_FIELDS template parameters allows specifying sub-spaces +// for a given number of "units" (often warps) such that the first +// `N_UNIT_FIELDS` fields are reserved sub-spaces per unit. +// In this case, the `get_size` and `set_ptrs` methods are modified such that +// you have to specify the number of units, and for `set_ptrs` the unit ID +// as well. +// This is useful for reserving exclusive shared memory per warp for example. +// Each unit (warp) will have its sub-space (containing the `N_UNIT_FIELDS` +// fields) aligned to the initial alignment as described above. 
+template +class smem_helper { + public: + static constexpr size_t N_ARGS = sizeof...(FieldsT); + + protected: + static_assert(N_ARGS > 0, "smem_helper: must have at least one field type"); + static_assert(N_UNIT_FIELDS >= 0, "smem_helper: #unit fields must be non-negative"); + static_assert(N_UNIT_FIELDS <= N_ARGS, + "smem_helper: #unit fields must be smaller than #field types"); + // following static assertion for FieldsT to not be scalar types is based on + // https://stackoverflow.com/a/28253503/4134127 + template + struct bool_pack; + template + using all_true_t = std::is_same, bool_pack>; + static_assert(all_true_t::value...>::value, + "smem_helper: the given field template types must be of type `field_type` and " + "cannot be scalars"); + + template + __host__ __device__ static constexpr typename std::enable_if<(IDX < N_ARGS), int32_t>::type + max_align() + { + using f_t = typename std::tuple_element>::type; + static_assert(f_t::ALIGNMENT > 0, "field alignments must be greater than 0"); + return max_align() > f_t::ALIGNMENT ? max_align() : f_t::ALIGNMENT; + } + template + __host__ __device__ static constexpr typename std::enable_if<(IDX >= N_ARGS), int32_t>::type + max_align() + { + return -1; + } + + // this is assumed to be a multiple of all alignments + static constexpr int32_t ALIGN_BASE = ALIGN_INIT > 0 ? ALIGN_INIT : max_align<0>(); + + // here we exploit that the base pointer must be aligned to 16 bytes. + // if 16 is a multiple of ALIGN_BASE, that means we don't have any overhead. + // if ALIGN_BASE is a multiple of 16, it means that we need at most + // ALIGN_BASE - 16 extra bytes, otherwise it's ALIGN_BASE - 1 + static constexpr int32_t SIZE_OVERHEAD = 16 % ALIGN_BASE == 0 ? 0 + : ALIGN_BASE % 16 == 0 ? ALIGN_BASE - 16 + : ALIGN_BASE - 1; + + public: + // cannot easily use "= default" here for host-only code + // NOLINTNEXTLINE(modernize-use-equals-default) + __host__ __device__ smem_helper() + { +#if defined(__CUDA_ARCH__) + // must be aligned to 16 bytes on all supported architectures + // (don't have a reference for this at the moment!) + extern __shared__ uint8_t smem[]; + // align manually to `ALIGN_BASE`: this avoids the `__align(X)__` attribute + // which can cause issues if this is used in the same compilation unit + // with different types / alignments. + // In any case, the compiler/hardware cannot do a better job at providing + // an aligned pointer than we can do manually. 
+ auto smem_aligned = align_to(reinterpret_cast(smem), uintptr_t(ALIGN_BASE)); + base_ptr_ = reinterpret_cast(smem_aligned); +#endif + } + + template + __host__ __device__ static inline typename std::enable_if<(N <= 0), int32_t>::type get_size( + const SizeT& sizes) + { + auto current_total = 0; // base pointer must be aligned to ALIGN_BASE + size_helper<1>(current_total, sizes); + return SIZE_OVERHEAD + current_total; + } + + template + __host__ __device__ static inline typename std::enable_if<(N > 0), int32_t>::type get_size( + const int32_t n_units, const SizeT& sizes) + { + auto current_total = 0; // base pointer must be aligned to all alignments + unit_size_helper<1>(current_total, sizes); + // since the unit size is aligned to ALIGN_BASE, every base pointer for + // each unit as well as the base pointer after all units is aligned to + // ALIGN_BASE: since that is a multiple of all alignments, we can safely + // continue adding the sizes afterwards + auto unit_size = align_to(current_total, ALIGN_BASE); + current_total = 0; // base pointer must be aligned to all alignments + size_helper(current_total, sizes); + return SIZE_OVERHEAD + unit_size * n_units + current_total; + } + + template + __device__ inline typename std::enable_if<(N <= 0)>::type set_ptrs( + const SizeT& sizes, typename FieldsT::data_t*&... ptrs) const + { + return ptrs_helper<1>(0, 0, 0, 0, sizes, ptrs...); + } + + template + __device__ inline typename std::enable_if<(N > 0)>::type set_ptrs( + const int32_t& unit_id, + const int32_t& n_units, + const SizeT& sizes, + typename FieldsT::data_t*&... ptrs) const + { + auto current_total = 0; // base pointer must be aligned to all alignments + unit_size_helper<1>(current_total, sizes); + // see explanation in `get_size` for what aligning to ALIGN_BASE means + auto unit_size = align_to(current_total, ALIGN_BASE); + return ptrs_helper<1>(0, unit_id, unit_size, n_units, sizes, ptrs...); + } + + protected: + template + __host__ __device__ static inline void single_size(int32_t& current_total, const SizeT& sizes) + { + using next_field_t = typename std::tuple_element<(NEXT < N_ARGS ? NEXT : N_ARGS - 1), + std::tuple>::type; + using this_field_t = typename std::tuple_element<(NEXT < N_ARGS ? NEXT - 1 : N_ARGS - 1), + std::tuple>::type; + static constexpr int32_t ALIGN = + NEXT == N_UNIT_FIELDS || NEXT >= N_ARGS ? 
1 : next_field_t::ALIGNMENT; + current_total = align_to(current_total + sizes[NEXT - 1] * this_field_t::BYTES, ALIGN); + } + + // parentheses in `enable_if` here are used to help the parser understand "<>" + template + __host__ __device__ static inline typename std::enable_if<(NEXT <= N_ARGS)>::type size_helper( + int32_t& current_total, const SizeT& sizes) + { + single_size(current_total, sizes); + size_helper(current_total, sizes); + } + template + __host__ __device__ static inline typename std::enable_if<(NEXT > N_ARGS)>::type size_helper( + int32_t& /* current_total */, const SizeT& /* sizes */) + { + } + + template + __host__ __device__ static inline typename std::enable_if<(NEXT <= N_UNIT_FIELDS)>::type + unit_size_helper(int32_t& current_total, const SizeT& sizes) + { + single_size(current_total, sizes); + unit_size_helper(current_total, sizes); + } + template + __host__ __device__ static inline typename std::enable_if<(NEXT > N_UNIT_FIELDS)>::type + unit_size_helper(int32_t& /* current_total */, const SizeT& /* sizes */) + { + } + + template + __device__ inline void ptrs_helper(const int32_t& /* offset */, + const int32_t& /* unit_id */, + const int32_t& /* unit_size */, + const int32_t& /* n_units */, + const SizeT& /* sizes */) const + { + } + template + __device__ inline void ptrs_helper(const int32_t& offset, + const int32_t& unit_id, + const int32_t& unit_size, + const int32_t& n_units, + const SizeT& sizes, + PtrT*& ptr, + PtrsT*&... ptrs) const + { + // see `get_size`: base_ptr_ + u_off is always aligned to all alignments + // (whether for each individual unit or after all units) + auto u_off = NEXT <= N_UNIT_FIELDS ? unit_id * unit_size : n_units * unit_size; + ptr = reinterpret_cast(base_ptr_ + (u_off + offset)); + int32_t next_offset = offset; + if (NEXT == N_UNIT_FIELDS) + next_offset = 0; // pointer after all unit fields is aligned to all alignments + else + single_size(next_offset, sizes); + ptrs_helper(next_offset, unit_id, unit_size, n_units, sizes, ptrs...); + } + + uint8_t* base_ptr_{nullptr}; +}; + +template +using smem_simple_t = smem_helper<0, 0, field_type...>; + +template +using smem_unit_simple_t = smem_helper<0, N_UNIT_FIELDS, field_type...>; + +} // namespace cugraph::ops::utils diff --git a/cpp/src/from_cugraph_ops/device_warp_collectives.cuh b/cpp/src/from_cugraph_ops/device_warp_collectives.cuh new file mode 100644 index 00000000000..198b3be2f12 --- /dev/null +++ b/cpp/src/from_cugraph_ops/device_warp_collectives.cuh @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "device_core.hpp" +#include "device_dim.cuh" +#include "macros.hpp" + +#include + +namespace cugraph::ops::utils { + +/** + * @brief get a bit mask for the `n_threads` lowest threads of a warp + * + * @param[in] n_threads number of threads in the mask + * + * @return the bit mask + */ +__host__ __device__ constexpr uint32_t low_thread_mask(int n_threads) +{ + return n_threads >= WARP_SIZE ? 0xffffffffU : (1U << n_threads) - 1U; +} + +/** + * apply a warp-wide sync (useful from Volta+ archs) + * + * @tparam NP number of participating threads + * + * @note This works on Pascal and earlier archs as well, but all threads with + * lane id <= NP must enter this function together and in convergence. 
+ */
+template
+__device__ inline void warp_sync()
+{
+  __syncwarp(low_thread_mask(NP));
+}
+
+/**
+ * @brief Shuffle the data inside a warp
+ *
+ * @tparam DataT the data type (currently assumed to be 4B)
+ *
+ * @param[in] val value to be shuffled
+ * @param[in] src_lane lane from where to shuffle
+ * @param[in] width lane width
+ * @param[in] mask mask of participating threads (Volta+)
+ *
+ * @return the shuffled data
+ */
+template
+__device__ inline DataT shfl(DataT val,
+                             int src_lane,
+                             int width = WARP_SIZE,
+                             uint32_t mask = 0xffffffffU)
+{
+  static_assert(CUDART_VERSION >= CUDA_VER_WARP_SHFL,
+                "Expected CUDA >= 9 for warp synchronous shuffle");
+  return __shfl_sync(mask, val, src_lane, width);
+}
+
+/**
+ * @brief Warp-level sum reduction
+ *
+ * @tparam DataT data type
+ * @tparam NP number of participating threads.
+ *            must be a power of 2 and at most warp size
+ *
+ * @param[in] val input value
+ *
+ * @return only lane 0 will contain the valid reduced result
+ *
+ * @note Why not cub? Because cub doesn't seem to allow working with arbitrary
+ *       number of warps in a block.
+ *
+ * @note All threads with lane id <= NP must enter this function together
+ *
+ * TODO(mjoux) Expand this to support arbitrary reduction ops
+ */
+template
+__device__ inline DataT warp_reduce(DataT val)
+{
+  static constexpr uint32_t MASK = low_thread_mask(NP);
+  CUGRAPH_OPS_UNROLL
+  for (int i = NP / 2; i > 0; i >>= 1) {
+    DataT tmp = shfl(val, lane_id() + i, NP, MASK);
+    val += tmp;
+  }
+  return val;
+}
+
+}  // namespace cugraph::ops::utils
diff --git a/cpp/src/from_cugraph_ops/macros.hpp b/cpp/src/from_cugraph_ops/macros.hpp
new file mode 100644
index 00000000000..0ff08af0b1a
--- /dev/null
+++ b/cpp/src/from_cugraph_ops/macros.hpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ */
+
+#define CUGRAPH_OPS_STRINGIFY_DETAIL(x) #x
+#define CUGRAPH_OPS_STRINGIFY(x) CUGRAPH_OPS_STRINGIFY_DETAIL(x)
+
+#define CUGRAPH_OPS_UNROLL _Pragma("unroll")
+#if defined(__clang__) && defined(__CUDA__)
+// clang wants pragma unroll without parentheses
+#define CUGRAPH_OPS_UNROLL_N(n) _Pragma(CUGRAPH_OPS_STRINGIFY(unroll n))
+#else
+// nvcc / nvrtc want pragma unroll with parentheses
+#define CUGRAPH_OPS_UNROLL_N(n) _Pragma(CUGRAPH_OPS_STRINGIFY(unroll(n)))
+#endif
+
+#if defined(__clang__)
+#define CUGRAPH_OPS_CONSTEXPR_D constexpr
+#else
+#define CUGRAPH_OPS_CONSTEXPR_D constexpr __device__
+#endif
+
+#if defined(__CUDACC__) || defined(__CUDA__)
+#define CUGRAPH_OPS_HD __host__ __device__
+#else
+#define CUGRAPH_OPS_HD
+#endif
+
+// The CUGRAPH_OPS_KERNEL macro specifies that a kernel has hidden visibility
+//
+// cugraph-ops needs to ensure that its CUGRAPH_OPS_KERNEL function
+// templates have hidden visibility (the default is weak visibility).
+//
+// When kernels have weak visibility it means that if two dynamic libraries
+// both contain identical instantiations of a kernel/template, then the linker
+// will discard one of the two instantiations and use only one of them.
+//
+// Due to unique requirements of how CUDA works, this de-duplication
+// can lead to the wrong kernels being called (SM version being wrong),
+// silently no kernel being called at all, or CUDA runtime errors being
+// thrown.
+// +// https://github.com/rapidsai/raft/issues/1722 +#ifndef CUGRAPH_OPS_KERNEL +#define CUGRAPH_OPS_KERNEL __global__ static +#endif diff --git a/cpp/src/from_cugraph_ops/sampling.hpp b/cpp/src/from_cugraph_ops/sampling.hpp new file mode 100644 index 00000000000..5663b8d9c03 --- /dev/null +++ b/cpp/src/from_cugraph_ops/sampling.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +// FIXME: This is only here for the prims... +// Need to look how Seunghwa fixed this in his PR +#include + +#include + +#include + +#include + +namespace cugraph::legacy::ops::graph { + +/** + * @brief Generate indexes given population sizes and a sample size, + * with or without replacement + * + * @param[out] index The (dense) index matrix. [on device] + * [dim = `n_sizes x sample_size`] + * In case `replace` is `false`, this may contain + * `ops::graph::INVALID_ID` + * if no index could be generated. + * @param[inout] rng RAFT RngState state object + * @param[in] sizes Input array of population sizes [on device] + * [len = `n_sizes`] + * @param[in] n_sizes number of sizes to sample from. + * @param[in] sample_size max number of indexes to be sampled per element + * in `sizes`. Assumed to be <= 384 at the moment. + * @param[in] replace If `true`, sample with replacement, otherwise + * without replacement. + * @param[in] stream cuda stream + * + @{ + */ +void get_sampling_index(int32_t* index, + raft::random::RngState& rng, + const int32_t* sizes, + int32_t n_sizes, + int32_t sample_size, + bool replace, + cudaStream_t stream); +void get_sampling_index(int64_t* index, + raft::random::RngState& rng, + const int64_t* sizes, + int64_t n_sizes, + int32_t sample_size, + bool replace, + cudaStream_t stream); + +} // namespace cugraph::legacy::ops::graph diff --git a/cpp/src/from_cugraph_ops/sampling_index.cu b/cpp/src/from_cugraph_ops/sampling_index.cu new file mode 100644 index 00000000000..fb1f4ac3f1e --- /dev/null +++ b/cpp/src/from_cugraph_ops/sampling_index.cu @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#include "sampling.hpp" +#include "sampling_index.cuh" + +namespace cugraph::legacy::ops::graph { + +void get_sampling_index(int32_t* index, + raft::random::RngState& rng, + const int32_t* sizes, + int32_t n_sizes, + int32_t sample_size, + bool replace, + cudaStream_t stream) +{ + get_sampling_index_impl(index, rng, sizes, n_sizes, sample_size, replace, stream); +} + +void get_sampling_index(int64_t* index, + raft::random::RngState& rng, + const int64_t* sizes, + int64_t n_sizes, + int32_t sample_size, + bool replace, + cudaStream_t stream) +{ + get_sampling_index_impl(index, rng, sizes, n_sizes, sample_size, replace, stream); +} + +} // namespace cugraph::legacy::ops::graph diff --git a/cpp/src/from_cugraph_ops/sampling_index.cuh b/cpp/src/from_cugraph_ops/sampling_index.cuh new file mode 100644 index 00000000000..9ac574315bb --- /dev/null +++ b/cpp/src/from_cugraph_ops/sampling_index.cuh @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "algo_R.cuh" +#include "sampling.hpp" + +#include +#include + +#include + +namespace cugraph::legacy::ops::graph { + +namespace utils = cugraph::ops::utils; + +template +using smem_algo_r_t = utils::smem_unit_simple_t<1, IdxT>; + +template +CUGRAPH_OPS_KERNEL void index_replace_kernel(raft::random::DeviceState rng_state, + IdxT* index, + const IdxT* sizes, + IdxT n_sizes, + int sample_size) +{ + using rand_t = std::make_unsigned_t; + // a warp-wide implementation. + auto lane = cugraph::ops::utils::lane_id(); + auto warp = utils::warp_id(); // 1D block with X dim + auto n_warps = utils::num_warps(); // 1D block with X dim + auto row_id = warp + static_cast(blockIdx.x) * IdxT{n_warps}; + if (row_id >= n_sizes) return; + // 1. load population size (once per warp) + IdxT size = IdxT{0}; + if (lane == 0) size = sizes[row_id]; + + // 2. shuffle it to all threads in warp + size = utils::shfl(size, 0); + + // 3. check valid size: possible early-out + if (size <= 0) { + CUGRAPH_OPS_UNROLL + for (auto i = lane; i < sample_size; i += utils::WARP_SIZE) { + index[row_id * IdxT{sample_size} + IdxT{i}] = cugraph::invalid_idx::value; + } + return; + } + + // 4. every thread generates its indexes + auto flat_id = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + GenT gen(rng_state, flat_id); + raft::random::UniformIntDistParams int_params{}; + int_params.start = IdxT{0}; + int_params.end = size; + int_params.diff = static_cast(size); + CUGRAPH_OPS_UNROLL + for (auto i = lane; i < sample_size; i += utils::WARP_SIZE) { + IdxT idx = IdxT{0}; + raft::random::custom_next(gen, &idx, int_params, 0, 0 /* idx / stride unused */); + + // 5. output index + index[row_id * IdxT{sample_size} + IdxT{i}] = idx; + } +} + +template +void get_sampling_index_replace(IdxT* index, + raft::random::RngState& rng, + const IdxT* sizes, + IdxT n_sizes, + int32_t sample_size, + cudaStream_t stream) +{ + // keep thread per block fairly low since we can expect sample_size < warp_size + // thus we want to have as many blocks as possible to increase parallelism + static constexpr int TPB = 128; + static constexpr int N_WARPS = TPB / utils::WARP_SIZE; + auto n_blks = utils::ceil_div(n_sizes, N_WARPS); + RAFT_CALL_RNG_FUNC( + rng, (index_replace_kernel<<>>), index, sizes, n_sizes, sample_size); + auto thread_rs = utils::ceil_div(IdxT{sample_size}, utils::WARP_SIZE); + rng.advance(static_cast(n_blks * TPB), thread_rs * sizeof(IdxT) / sizeof(int32_t)); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +template +CUGRAPH_OPS_KERNEL void index_algo_r_kernel(raft::random::DeviceState rng_state, + IdxT* index, + const IdxT* sizes, + IdxT n_sizes, + int sample_size) +{ + using rand_t = std::make_unsigned_t; + // a warp-wide implementation. + auto lane = utils::lane_id(); + auto warp = utils::warp_id(); // 1D block with X dim + auto row_id = warp + static_cast(blockIdx.x) * IdxT{N_WARPS}; + if (row_id >= n_sizes) return; + IdxT* s_idx; + smem_algo_r_t smem{}; + int32_t smem_sizes[] = {sample_size}; + smem.set_ptrs(warp, N_WARPS, smem_sizes, s_idx); + // 1. load population size (once per warp) + IdxT size = IdxT{0}; + if (lane == 0) size = sizes[row_id]; + + // 2. shuffle it to all threads in warp + size = utils::shfl(size, 0); + + // 3. 
Get algo R indexes per warp + cugraph::ops::graph::warp_algo_r_index( + s_idx, size, IdxT{0}, sample_size, rng_state); + + CUGRAPH_OPS_UNROLL + for (auto i = lane; i < sample_size; i += utils::WARP_SIZE) { + // 4. output index + // still need to check if the index is actually valid + auto idx = s_idx[i]; + index[row_id * IdxT{sample_size} + IdxT{i}] = + idx >= size ? cugraph::invalid_idx::value : idx; + } +} + +template +void get_sampling_index_reservoir(IdxT* index, + raft::random::RngState& rng, + const IdxT* sizes, + IdxT n_sizes, + int32_t sample_size, + cudaStream_t stream) +{ + // same TPB as in algo R: increased SM occupancy is most important here + static constexpr int TPB = 512; + static constexpr int N_WARPS = TPB / utils::WARP_SIZE; + auto n_blks = utils::ceil_div(n_sizes, N_WARPS); + int32_t smem_sizes[] = {sample_size}; + size_t smem_size = smem_algo_r_t::get_size(N_WARPS, smem_sizes); + RAFT_CALL_RNG_FUNC(rng, + (index_algo_r_kernel<<>>), + index, + sizes, + n_sizes, + sample_size); + auto thread_rs = utils::ceil_div( + std::max(IdxT{0}, std::min(std::numeric_limits::max(), n_sizes) - IdxT{sample_size}), + utils::WARP_SIZE); + rng.advance(static_cast(n_blks * TPB), thread_rs * sizeof(IdxT) / sizeof(int32_t)); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +template +void get_sampling_index_impl(IdxT* index, + raft::random::RngState& rng, + const IdxT* sizes, + IdxT n_sizes, + int32_t sample_size, + bool replace, + cudaStream_t stream) +{ + if (replace) { + get_sampling_index_replace(index, rng, sizes, n_sizes, sample_size, stream); + } else { + get_sampling_index_reservoir(index, rng, sizes, n_sizes, sample_size, stream); + } +} + +} // namespace cugraph::legacy::ops::graph diff --git a/cpp/src/generators/erdos_renyi_generator.cuh b/cpp/src/generators/erdos_renyi_generator.cuh index cd461ee1aa2..10573ddb0d0 100644 --- a/cpp/src/generators/erdos_renyi_generator.cuh +++ b/cpp/src/generators/erdos_renyi_generator.cuh @@ -40,6 +40,11 @@ generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle, vertex_t base_vertex_id, uint64_t seed) { + // NOTE: + // https://networkx.org/documentation/stable/_modules/networkx/generators/random_graphs.html#fast_gnp_random_graph + // identifies a faster algorithm that I think would be very efficient on the GPU. I believe we + // could just compute lr/lp in that code for a batch of values, use prefix sums to generate edge + // ids and then convert the generated values to a batch of edges. CUGRAPH_EXPECTS(num_vertices < std::numeric_limits::max(), "Implementation cannot support specified value"); @@ -88,6 +93,11 @@ generate_erdos_renyi_graph_edgelist_gnm(raft::handle_t const& handle, uint64_t seed) { CUGRAPH_FAIL("Not implemented"); + + // To implement: + // Use sampling function to select `m` unique edge ids from the + // (num_vertices ^ 2) possible edges. Convert these to vertex + // ids. 
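+  //
+  // Illustrative (untested) sketch of that conversion step, assuming each
+  // sampled edge id e is a 64-bit value in [0, num_vertices * num_vertices):
+  //   vertex_t src = static_cast<vertex_t>(e / num_vertices);
+  //   vertex_t dst = static_cast<vertex_t>(e % num_vertices);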
} } // namespace cugraph diff --git a/cpp/src/layout/legacy/barnes_hut.cuh b/cpp/src/layout/legacy/barnes_hut.cuh index fa6d3816417..fdd57c7772d 100644 --- a/cpp/src/layout/legacy/barnes_hut.cuh +++ b/cpp/src/layout/legacy/barnes_hut.cuh @@ -19,7 +19,6 @@ #include "bh_kernels.cuh" #include "converters/legacy/COOtoCSR.cuh" #include "fa2_kernels.cuh" -#include "utilities/graph_utils.cuh" #include "utils.hpp" #include diff --git a/cpp/src/layout/legacy/fa2_kernels.cuh b/cpp/src/layout/legacy/fa2_kernels.cuh index 33e7841a380..195889eebfb 100644 --- a/cpp/src/layout/legacy/fa2_kernels.cuh +++ b/cpp/src/layout/legacy/fa2_kernels.cuh @@ -17,7 +17,9 @@ #pragma once #define restrict __restrict__ -#include "utilities/graph_utils.cuh" +// From old graph_utils.cuh +#define CUDA_MAX_BLOCKS 65535 +#define CUDA_MAX_KERNEL_THREADS 256 // kernel will launch at most 256 threads per block namespace cugraph { namespace detail { diff --git a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh index c4c071b8f2c..7d4750c0554 100644 --- a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include "from_cugraph_ops/sampling.hpp" #include "prims/detail/partition_v_frontier.cuh" #include "prims/detail/transform_v_frontier_e.cuh" #include "prims/property_op_utils.cuh" @@ -33,9 +34,6 @@ #include #include -#ifndef NO_CUGRAPH_OPS -#include -#endif #include #include @@ -639,7 +637,7 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( auto mid_partition_size = frontier_partition_offsets[2] - frontier_partition_offsets[1]; if (mid_partition_size > 0) { // FIXME: tmp_degrees & tmp_nbr_indices can be avoided if we customize - // cugraph::ops::get_sampling_index + // cugraph::legacy::ops::get_sampling_index rmm::device_uvector tmp_degrees(mid_partition_size, handle.get_stream()); rmm::device_uvector tmp_nbr_indices(mid_partition_size * K, handle.get_stream()); thrust::gather(handle.get_thrust_policy(), @@ -647,13 +645,13 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( frontier_indices.begin() + frontier_partition_offsets[2], frontier_degrees.begin(), tmp_degrees.begin()); - cugraph::ops::graph::get_sampling_index(tmp_nbr_indices.data(), - rng_state, - tmp_degrees.data(), - mid_partition_size, - static_cast(K), - false, - handle.get_stream()); + cugraph::legacy::ops::graph::get_sampling_index(tmp_nbr_indices.data(), + rng_state, + tmp_degrees.data(), + mid_partition_size, + static_cast(K), + false, + handle.get_stream()); thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), @@ -736,7 +734,7 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( } if (retry_segment_indices) { - cugraph::ops::graph::get_sampling_index( + cugraph::legacy::ops::graph::get_sampling_index( (*retry_nbr_indices).data(), rng_state, (*retry_degrees).begin(), @@ -752,7 +750,7 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( segment_frontier_degree_first, segment_frontier_degree_first + num_segments, tmp_degrees.begin()); - cugraph::ops::graph::get_sampling_index( + cugraph::legacy::ops::graph::get_sampling_index( tmp_nbr_indices.data(), rng_state, tmp_degrees.data(), @@ -1626,13 +1624,13 @@ uniform_sample_and_compute_local_nbr_indices( if (with_replacement) { if (frontier_degrees.size() > 0) { nbr_indices.resize(frontier_degrees.size() * 
K, handle.get_stream()); - cugraph::ops::graph::get_sampling_index(nbr_indices.data(), - rng_state, - frontier_degrees.data(), - static_cast(frontier_degrees.size()), - static_cast(K), - with_replacement, - handle.get_stream()); + cugraph::legacy::ops::graph::get_sampling_index(nbr_indices.data(), + rng_state, + frontier_degrees.data(), + static_cast(frontier_degrees.size()), + static_cast(K), + with_replacement, + handle.get_stream()); frontier_degrees.resize(0, handle.get_stream()); frontier_degrees.shrink_to_fit(handle.get_stream()); } diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index 015a9c683f1..03514e52e6e 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include "from_cugraph_ops/sampling.hpp" #include "prims/detail/sample_and_compute_local_nbr_indices.cuh" #include "prims/property_op_utils.cuh" @@ -30,9 +31,6 @@ #include #include -#ifndef NO_CUGRAPH_OPS -#include -#endif #include #include diff --git a/cpp/src/sampling/negative_sampling_impl.cuh b/cpp/src/sampling/negative_sampling_impl.cuh new file mode 100644 index 00000000000..93bb03077bc --- /dev/null +++ b/cpp/src/sampling/negative_sampling_impl.cuh @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "prims/reduce_v.cuh" +#include "prims/update_edge_src_dst_property.cuh" +#include "utilities/collect_comm.cuh" + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cugraph { + +namespace detail { + +template +std::tuple>, + std::optional>> +normalize_biases(raft::handle_t const& handle, + graph_view_t const& graph_view, + raft::device_span biases) +{ + std::optional> normalized_biases{std::nullopt}; + std::optional> gpu_biases{std::nullopt}; + + // Need to normalize the biases + normalized_biases = + std::make_optional>(biases.size(), handle.get_stream()); + + weight_t sum = + thrust::reduce(handle.get_thrust_policy(), biases.begin(), biases.end(), weight_t{0}); + + thrust::transform(handle.get_thrust_policy(), + biases.begin(), + biases.end(), + normalized_biases->begin(), + divider_t{sum}); + + thrust::inclusive_scan(handle.get_thrust_policy(), + normalized_biases->begin(), + normalized_biases->end(), + normalized_biases->begin()); + + if constexpr (multi_gpu) { + rmm::device_scalar d_sum(sum, handle.get_stream()); + + gpu_biases = cugraph::device_allgatherv( + handle, handle.get_comms(), raft::device_span{d_sum.data(), d_sum.size()}); + + weight_t aggregate_sum = thrust::reduce( + handle.get_thrust_policy(), gpu_biases->begin(), gpu_biases->end(), weight_t{0}); + + // FIXME: https://github.com/rapidsai/raft/issues/2400 results in the possibility + // that 1 can appear as a random floating point value. 
We're going to use + // thrust::upper_bound to assign random values to GPUs, we need the value 1.0 to + // be part of the upper-most range. We'll compute the last non-zero value in the + // gpu_biases array here and below we will fill it with a value larger than 1.0 + size_t trailing_zeros = thrust::distance( + thrust::make_reverse_iterator(gpu_biases->end()), + thrust::find_if(handle.get_thrust_policy(), + thrust::make_reverse_iterator(gpu_biases->end()), + thrust::make_reverse_iterator(gpu_biases->begin()), + [] __device__(weight_t bias) { return bias > weight_t{0}; })); + + thrust::transform(handle.get_thrust_policy(), + gpu_biases->begin(), + gpu_biases->end(), + gpu_biases->begin(), + divider_t{aggregate_sum}); + + thrust::inclusive_scan( + handle.get_thrust_policy(), gpu_biases->begin(), gpu_biases->end(), gpu_biases->begin()); + + // FIXME: conclusion of above. Using 1.1 since it is > 1.0 and easy to type + thrust::copy_n(handle.get_thrust_policy(), + thrust::make_constant_iterator(1.1), + trailing_zeros + 1, + gpu_biases->begin() + gpu_biases->size() - trailing_zeros - 1); + } + + return std::make_tuple(std::move(normalized_biases), std::move(gpu_biases)); +} + +template +rmm::device_uvector create_local_samples( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> const& normalized_biases, + std::optional> const& gpu_biases, + size_t samples_in_this_batch) +{ + rmm::device_uvector samples(0, handle.get_stream()); + + if (normalized_biases) { + size_t samples_to_generate{samples_in_this_batch}; + std::vector sample_count_from_each_gpu; + + rmm::device_uvector position(0, handle.get_stream()); + + if constexpr (multi_gpu) { + // Determine how many vertices are generated on each GPU + auto const comm_size = handle.get_comms().get_size(); + auto const comm_rank = handle.get_comms().get_rank(); + + sample_count_from_each_gpu.resize(comm_size); + + rmm::device_uvector gpu_counts(comm_size, handle.get_stream()); + position.resize(samples_in_this_batch, handle.get_stream()); + + thrust::fill(handle.get_thrust_policy(), gpu_counts.begin(), gpu_counts.end(), size_t{0}); + thrust::sequence(handle.get_thrust_policy(), position.begin(), position.end()); + + rmm::device_uvector random_values(samples_in_this_batch, handle.get_stream()); + detail::uniform_random_fill(handle.get_stream(), + random_values.data(), + random_values.size(), + weight_t{0}, + weight_t{1}, + rng_state); + + thrust::sort(handle.get_thrust_policy(), + thrust::make_zip_iterator(random_values.begin(), position.begin()), + thrust::make_zip_iterator(random_values.end(), position.end())); + + thrust::upper_bound(handle.get_thrust_policy(), + random_values.begin(), + random_values.end(), + gpu_biases->begin(), + gpu_biases->end(), + gpu_counts.begin()); + + thrust::adjacent_difference( + handle.get_thrust_policy(), gpu_counts.begin(), gpu_counts.end(), gpu_counts.begin()); + + std::vector tx_counts(gpu_counts.size()); + std::fill(tx_counts.begin(), tx_counts.end(), size_t{1}); + + rmm::device_uvector d_sample_count_from_each_gpu(0, handle.get_stream()); + + std::tie(d_sample_count_from_each_gpu, std::ignore) = + shuffle_values(handle.get_comms(), gpu_counts.begin(), tx_counts, handle.get_stream()); + + samples_to_generate = thrust::reduce(handle.get_thrust_policy(), + d_sample_count_from_each_gpu.begin(), + d_sample_count_from_each_gpu.end(), + size_t{0}); + + raft::update_host(sample_count_from_each_gpu.data(), + d_sample_count_from_each_gpu.data(), + 
d_sample_count_from_each_gpu.size(), + handle.get_stream()); + } + + // Generate samples + // FIXME: We could save this memory if we had an iterator that + // generated random values. + rmm::device_uvector random_values(samples_to_generate, handle.get_stream()); + samples.resize(samples_to_generate, handle.get_stream()); + detail::uniform_random_fill(handle.get_stream(), + random_values.data(), + random_values.size(), + weight_t{0}, + weight_t{1}, + rng_state); + + thrust::transform( + handle.get_thrust_policy(), + random_values.begin(), + random_values.end(), + samples.begin(), + [biases = + raft::device_span{normalized_biases->data(), normalized_biases->size()}, + offset = graph_view.local_vertex_partition_range_first()] __device__(weight_t r) { + size_t result = + offset + + static_cast(thrust::distance( + biases.begin(), thrust::lower_bound(thrust::seq, biases.begin(), biases.end(), r))); + + // FIXME: https://github.com/rapidsai/raft/issues/2400 + // results in the possibility that 1 can appear as a + // random floating point value, which results in the sampling + // algorithm below generating a value that's OOB. + if (result == (offset + biases.size())) --result; + + return result; + }); + + // Shuffle them back + if constexpr (multi_gpu) { + std::tie(samples, std::ignore) = shuffle_values( + handle.get_comms(), samples.begin(), sample_count_from_each_gpu, handle.get_stream()); + + thrust::sort(handle.get_thrust_policy(), + thrust::make_zip_iterator(position.begin(), samples.begin()), + thrust::make_zip_iterator(position.end(), samples.begin())); + } + } else { + samples.resize(samples_in_this_batch, handle.get_stream()); + + // Uniformly select a vertex from any GPU + detail::uniform_random_fill(handle.get_stream(), + samples.data(), + samples.size(), + vertex_t{0}, + graph_view.number_of_vertices(), + rng_state); + } + + return samples; +} + +} // namespace detail + +template +std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_biases, + std::optional> dst_biases, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check) +{ + rmm::device_uvector src(0, handle.get_stream()); + rmm::device_uvector dst(0, handle.get_stream()); + + // Optimistically assume we can do this in one pass + size_t samples_in_this_batch = num_samples; + + // Normalize the biases and (for MG) determine how the biases are + // distributed across the GPUs. + std::optional> normalized_src_biases{std::nullopt}; + std::optional> gpu_src_biases{std::nullopt}; + std::optional> normalized_dst_biases{std::nullopt}; + std::optional> gpu_dst_biases{std::nullopt}; + + if (src_biases) + std::tie(normalized_src_biases, gpu_src_biases) = + detail::normalize_biases(handle, graph_view, *src_biases); + + if (dst_biases) + std::tie(normalized_dst_biases, gpu_dst_biases) = + detail::normalize_biases(handle, graph_view, *dst_biases); + + while (samples_in_this_batch > 0) { + if constexpr (multi_gpu) { + auto const comm_size = handle.get_comms().get_size(); + auto const comm_rank = handle.get_comms().get_rank(); + + samples_in_this_batch = + (samples_in_this_batch / static_cast(comm_size)) + + (static_cast(comm_rank) < (samples_in_this_batch % static_cast(comm_size)) + ? 
1 + : 0); + } + + auto batch_src = create_local_samples( + handle, rng_state, graph_view, normalized_src_biases, gpu_src_biases, samples_in_this_batch); + auto batch_dst = create_local_samples( + handle, rng_state, graph_view, normalized_dst_biases, gpu_dst_biases, samples_in_this_batch); + + if constexpr (multi_gpu) { + auto vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts(); + + std::tie(batch_src, batch_dst, std::ignore, std::ignore, std::ignore, std::ignore) = + detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( + handle, + std::move(batch_src), + std::move(batch_dst), + std::nullopt, + std::nullopt, + std::nullopt, + vertex_partition_range_lasts); + } + + if (remove_existing_edges) { + auto has_edge_flags = + graph_view.has_edge(handle, + raft::device_span{batch_src.data(), batch_src.size()}, + raft::device_span{batch_dst.data(), batch_dst.size()}, + do_expensive_check); + + auto begin_iter = thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()); + auto new_end = thrust::remove_if(handle.get_thrust_policy(), + begin_iter, + begin_iter + batch_src.size(), + has_edge_flags.begin(), + thrust::identity()); + + batch_src.resize(thrust::distance(begin_iter, new_end), handle.get_stream()); + batch_dst.resize(thrust::distance(begin_iter, new_end), handle.get_stream()); + } + + if (remove_duplicates) { + thrust::sort(handle.get_thrust_policy(), + thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), + thrust::make_zip_iterator(batch_src.end(), batch_dst.end())); + + auto new_end = thrust::unique(handle.get_thrust_policy(), + thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), + thrust::make_zip_iterator(batch_src.end(), batch_dst.end())); + + size_t new_size = + thrust::distance(thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), new_end); + + if (src.size() > 0) { + rmm::device_uvector new_src(src.size() + new_size, handle.get_stream()); + rmm::device_uvector new_dst(dst.size() + new_size, handle.get_stream()); + + thrust::merge(handle.get_thrust_policy(), + thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), + new_end, + thrust::make_zip_iterator(src.begin(), dst.begin()), + thrust::make_zip_iterator(src.end(), dst.end()), + thrust::make_zip_iterator(new_src.begin(), new_dst.begin())); + + new_end = thrust::unique(handle.get_thrust_policy(), + thrust::make_zip_iterator(new_src.begin(), new_dst.begin()), + thrust::make_zip_iterator(new_src.end(), new_dst.end())); + + new_size = + thrust::distance(thrust::make_zip_iterator(new_src.begin(), new_dst.begin()), new_end); + + src = std::move(new_src); + dst = std::move(new_dst); + } else { + src = std::move(batch_src); + dst = std::move(batch_dst); + } + + src.resize(new_size, handle.get_stream()); + dst.resize(new_size, handle.get_stream()); + } else if (src.size() > 0) { + size_t current_end = src.size(); + + src.resize(src.size() + batch_src.size(), handle.get_stream()); + dst.resize(dst.size() + batch_dst.size(), handle.get_stream()); + + thrust::copy(handle.get_thrust_policy(), + thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), + thrust::make_zip_iterator(batch_src.end(), batch_dst.end()), + thrust::make_zip_iterator(src.begin(), dst.begin()) + current_end); + } else { + src = std::move(batch_src); + dst = std::move(batch_dst); + } + + if (exact_number_of_samples) { + size_t current_sample_size = src.size(); + if constexpr (multi_gpu) { + current_sample_size = cugraph::host_scalar_allreduce( + handle.get_comms(), 
current_sample_size, raft::comms::op_t::SUM, handle.get_stream()); + } + + // FIXME: We could oversample and discard the unnecessary samples + // to reduce the number of iterations in the outer loop, but it seems like + // exact_number_of_samples is an edge case not worth optimizing for at this time. + samples_in_this_batch = num_samples - current_sample_size; + } else { + samples_in_this_batch = 0; + } + } + + src.shrink_to_fit(handle.get_stream()); + dst.shrink_to_fit(handle.get_stream()); + + return std::make_tuple(std::move(src), std::move(dst)); +} + +} // namespace cugraph diff --git a/cpp/src/sampling/negative_sampling_mg_v32_e32.cu b/cpp/src/sampling/negative_sampling_mg_v32_e32.cu new file mode 100644 index 00000000000..ce54d54d319 --- /dev/null +++ b/cpp/src/sampling/negative_sampling_mg_v32_e32.cu @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "negative_sampling_impl.cuh" + +#include +#include + +namespace cugraph { + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/sampling/negative_sampling_mg_v32_e64.cu b/cpp/src/sampling/negative_sampling_mg_v32_e64.cu new file mode 100644 index 00000000000..af4c28c0f1a --- /dev/null +++ b/cpp/src/sampling/negative_sampling_mg_v32_e64.cu @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "negative_sampling_impl.cuh" + +#include +#include + +namespace cugraph { + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/sampling/negative_sampling_mg_v64_e64.cu b/cpp/src/sampling/negative_sampling_mg_v64_e64.cu new file mode 100644 index 00000000000..c5691fb4644 --- /dev/null +++ b/cpp/src/sampling/negative_sampling_mg_v64_e64.cu @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "negative_sampling_impl.cuh" + +#include +#include + +namespace cugraph { + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/sampling/negative_sampling_sg_v32_e32.cu b/cpp/src/sampling/negative_sampling_sg_v32_e32.cu new file mode 100644 index 00000000000..3712414e4ec --- /dev/null +++ b/cpp/src/sampling/negative_sampling_sg_v32_e32.cu @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "negative_sampling_impl.cuh" + +#include +#include + +namespace cugraph { + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/sampling/negative_sampling_sg_v32_e64.cu b/cpp/src/sampling/negative_sampling_sg_v32_e64.cu new file mode 100644 index 00000000000..c66c31a4258 --- /dev/null +++ b/cpp/src/sampling/negative_sampling_sg_v32_e64.cu @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "negative_sampling_impl.cuh" + +#include +#include + +namespace cugraph { + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/sampling/negative_sampling_sg_v64_e64.cu b/cpp/src/sampling/negative_sampling_sg_v64_e64.cu new file mode 100644 index 00000000000..e4fc50890e4 --- /dev/null +++ b/cpp/src/sampling/negative_sampling_sg_v64_e64.cu @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "negative_sampling_impl.cuh" + +#include +#include + +namespace cugraph { + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh index 3b0bc15df93..0b1d9dcdb56 100644 --- a/cpp/src/sampling/random_walks.cuh +++ b/cpp/src/sampling/random_walks.cuh @@ -18,8 +18,6 @@ // #pragma once -#include "utilities/graph_utils.cuh" - #include #include #include diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp index 45cc1e54cb4..2c5658b32a5 100644 --- a/cpp/src/sampling/rw_traversals.hpp +++ b/cpp/src/sampling/rw_traversals.hpp @@ -18,8 +18,6 @@ // #pragma once -#include "utilities/graph_utils.cuh" - #include #include diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 5371d53bcf0..f925a142737 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -803,7 +803,7 @@ graph_view_t edge_srcs, raft::device_span edge_dsts, - bool do_expensive_check) + bool do_expensive_check) const { CUGRAPH_EXPECTS( edge_srcs.size() == edge_dsts.size(), @@ -883,7 +883,7 @@ graph_view_t edge_srcs, raft::device_span edge_dsts, - bool do_expensive_check) + bool do_expensive_check) const { CUGRAPH_EXPECTS( edge_srcs.size() == edge_dsts.size(), diff --git a/cpp/src/structure/legacy/graph.cu b/cpp/src/structure/legacy/graph.cu index 7e1238e1558..a504125080b 100644 --- a/cpp/src/structure/legacy/graph.cu +++ b/cpp/src/structure/legacy/graph.cu @@ -14,17 +14,18 @@ * limitations under the License. */ -#include "utilities/graph_utils.cuh" - #include #include +#include #include #include +#include #include #include +#include namespace { @@ -69,15 +70,40 @@ namespace legacy { template void GraphViewBase::get_vertex_identifiers(VT* identifiers) const { - cugraph::detail::sequence(number_of_vertices, identifiers); + thrust::sequence(thrust::device, + thrust::device_pointer_cast(identifiers), + thrust::device_pointer_cast(identifiers + number_of_vertices), + VT{0}); + RAFT_CHECK_CUDA(nullptr); } +// FIXME: Need to get rid of this function... 
still used in python template void GraphCompressedSparseBaseView::get_source_indices(VT* src_indices) const { CUGRAPH_EXPECTS(offsets != nullptr, "No graph specified"); - cugraph::detail::offsets_to_indices( - offsets, GraphViewBase::number_of_vertices, src_indices); + rmm::cuda_stream_view stream_view; + + raft::device_span indices_span(src_indices, GraphViewBase::number_of_edges); + + if (indices_span.size() > 0) { + thrust::fill(rmm::exec_policy(stream_view), indices_span.begin(), indices_span.end(), VT{0}); + + thrust::for_each(rmm::exec_policy(stream_view), + offsets + 1, + offsets + GraphViewBase::number_of_vertices, + [indices_span] __device__(ET offset) { + if (offset < static_cast(indices_span.size())) { + cuda::atomic_ref atomic_counter( + indices_span.data()[offset]); + atomic_counter.fetch_add(VT{1}, cuda::std::memory_order_relaxed); + } + }); + thrust::inclusive_scan(rmm::exec_policy(stream_view), + indices_span.begin(), + indices_span.end(), + indices_span.begin()); + } } template @@ -152,6 +178,4 @@ void GraphCompressedSparseBaseView::degree(ET* degree, DegreeDirecti } // namespace legacy } // namespace cugraph -#include "utilities/eidir_graph_utils.hpp" - #include diff --git a/cpp/src/structure/select_random_vertices_impl.hpp b/cpp/src/structure/select_random_vertices_impl.hpp index e6857a5beda..d7502b3f6da 100644 --- a/cpp/src/structure/select_random_vertices_impl.hpp +++ b/cpp/src/structure/select_random_vertices_impl.hpp @@ -16,6 +16,7 @@ #pragma once #include "detail/graph_partition_utils.cuh" +#include "from_cugraph_ops/sampling.hpp" #include #include @@ -30,10 +31,6 @@ #include #include -#ifndef NO_CUGRAPH_OPS -#include -#endif - #include #include #include diff --git a/cpp/src/traversal/extract_bfs_paths_impl.cuh b/cpp/src/traversal/extract_bfs_paths_impl.cuh index 3790c0057cb..40030e2e39c 100644 --- a/cpp/src/traversal/extract_bfs_paths_impl.cuh +++ b/cpp/src/traversal/extract_bfs_paths_impl.cuh @@ -17,7 +17,6 @@ #include "detail/graph_partition_utils.cuh" #include "utilities/collect_comm.cuh" -#include "utilities/graph_utils.cuh" #include #include diff --git a/cpp/src/utilities/eidecl_graph_utils.hpp b/cpp/src/utilities/eidecl_graph_utils.hpp deleted file mode 100644 index abf026cbbfe..00000000000 --- a/cpp/src/utilities/eidecl_graph_utils.hpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -namespace cugraph { -namespace detail { - -extern template __device__ float parallel_prefix_sum(int32_t, int32_t const*, float const*); -extern template __device__ double parallel_prefix_sum(int32_t, int32_t const*, double const*); -extern template __device__ float parallel_prefix_sum(int64_t, int32_t const*, float const*); -extern template __device__ double parallel_prefix_sum(int64_t, int32_t const*, double const*); -extern template __device__ float parallel_prefix_sum(int64_t, int64_t const*, float const*); -extern template __device__ double parallel_prefix_sum(int64_t, int64_t const*, double const*); - -extern template void offsets_to_indices(int const*, int, int*); -extern template void offsets_to_indices(long const*, int, int*); -extern template void offsets_to_indices(long const*, long, long*); - -extern template __attribute__((visibility("hidden"))) __global__ void -offsets_to_indices_kernel(int const*, int, int*); -extern template __attribute__((visibility("hidden"))) __global__ void -offsets_to_indices_kernel(long const*, int, int*); -extern template __attribute__((visibility("hidden"))) __global__ void -offsets_to_indices_kernel(long const*, long, long*); - -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/utilities/eidir_graph_utils.hpp b/cpp/src/utilities/eidir_graph_utils.hpp deleted file mode 100644 index ba06c6f56ea..00000000000 --- a/cpp/src/utilities/eidir_graph_utils.hpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -namespace cugraph { -namespace detail { - -template __device__ float parallel_prefix_sum(int32_t, int32_t const*, float const*); -template __device__ double parallel_prefix_sum(int32_t, int32_t const*, double const*); -template __device__ float parallel_prefix_sum(int64_t, int32_t const*, float const*); -template __device__ double parallel_prefix_sum(int64_t, int32_t const*, double const*); -template __device__ float parallel_prefix_sum(int64_t, int64_t const*, float const*); -template __device__ double parallel_prefix_sum(int64_t, int64_t const*, double const*); - -template void offsets_to_indices(int32_t const*, int32_t, int32_t*); -template void offsets_to_indices(int64_t const*, int32_t, int32_t*); -template void offsets_to_indices(int64_t const*, int64_t, int64_t*); - -template __global__ __attribute__((visibility("hidden"))) void -offsets_to_indices_kernel(int32_t const*, int32_t, int32_t*); -template __global__ __attribute__((visibility("hidden"))) void -offsets_to_indices_kernel(int64_t const*, int32_t, int32_t*); -template __global__ __attribute__((visibility("hidden"))) void -offsets_to_indices_kernel(int64_t const*, int64_t, int64_t*); - -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/utilities/graph_utils.cuh b/cpp/src/utilities/graph_utils.cuh deleted file mode 100644 index 0b257e7abde..00000000000 --- a/cpp/src/utilities/graph_utils.cuh +++ /dev/null @@ -1,430 +0,0 @@ -/* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. - * - */ - -// Interanl helper functions -// Author: Alex Fender afender@nvidia.com -#pragma once - -#include - -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace cugraph { -namespace detail { - -// #define DEBUG 1 -#define CUDA_MAX_BLOCKS 65535 -#define CUDA_MAX_KERNEL_THREADS 256 // kernel will launch at most 256 threads per block -#define US - -template -__inline__ __device__ value_t parallel_prefix_sum(count_t n, index_t const* ind, value_t const* w) -{ - count_t i, j, mn; - value_t v, last; - value_t sum = 0.0; - bool valid; - - // Parallel prefix sum (using __shfl) - mn = (((n + blockDim.x - 1) / blockDim.x) * blockDim.x); // n in multiple of blockDim.x - for (i = threadIdx.x; i < mn; i += blockDim.x) { - // All threads (especially the last one) must always participate - // in the shfl instruction, otherwise their sum will be undefined. - // So, the loop stopping condition is based on multiple of n in loop increments, - // so that all threads enter into the loop and inside we make sure we do not - // read out of bounds memory checking for the actual size n. - - // check if the thread is valid - valid = i < n; - - // Notice that the last thread is used to propagate the prefix sum. - // For all the threads, in the first iteration the last is 0, in the following - // iterations it is the value at the last thread of the previous iterations. 
- - // get the value of the last thread - last = __shfl_sync(raft::warp_full_mask(), sum, blockDim.x - 1, blockDim.x); - - // if you are valid read the value from memory, otherwise set your value to 0 - sum = (valid) ? w[ind[i]] : 0.0; - - // do prefix sum (of size warpSize=blockDim.x =< 32) - for (j = 1; j < blockDim.x; j *= 2) { - v = __shfl_up_sync(raft::warp_full_mask(), sum, j, blockDim.x); - if (threadIdx.x >= j) sum += v; - } - // shift by last - sum += last; - // notice that no __threadfence or __syncthreads are needed in this implementation - } - // get the value of the last thread (to all threads) - last = __shfl_sync(raft::warp_full_mask(), sum, blockDim.x - 1, blockDim.x); - - return last; -} - -// axpy -template -struct axpy_functor : public thrust::binary_function { - const T a; - axpy_functor(T _a) : a(_a) {} - __host__ __device__ T operator()(const T& x, const T& y) const { return a * x + y; } -}; - -template -void axpy(size_t n, T a, T* x, T* y) -{ - rmm::cuda_stream_view stream_view; - thrust::transform(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::device_pointer_cast(y), - thrust::device_pointer_cast(y), - axpy_functor(a)); - RAFT_CHECK_CUDA(stream_view.value()); -} - -// norm -template -struct square { - __host__ __device__ T operator()(const T& x) const { return x * x; } -}; - -template -T nrm2(size_t n, T* x) -{ - rmm::cuda_stream_view stream_view; - T init = 0; - T result = std::sqrt(thrust::transform_reduce(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - square(), - init, - thrust::plus())); - RAFT_CHECK_CUDA(stream_view.value()); - return result; -} - -template -T nrm1(size_t n, T* x) -{ - rmm::cuda_stream_view stream_view; - T result = thrust::reduce(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n)); - RAFT_CHECK_CUDA(stream_view.value()); - return result; -} - -template -void scal(size_t n, T val, T* x) -{ - rmm::cuda_stream_view stream_view; - thrust::transform(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::make_constant_iterator(val), - thrust::device_pointer_cast(x), - thrust::multiplies()); - RAFT_CHECK_CUDA(stream_view.value()); -} - -template -void addv(size_t n, T val, T* x) -{ - rmm::cuda_stream_view stream_view; - thrust::transform(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::make_constant_iterator(val), - thrust::device_pointer_cast(x), - thrust::plus()); - RAFT_CHECK_CUDA(stream_view.value()); -} - -template -void fill(size_t n, T* x, T value) -{ - rmm::cuda_stream_view stream_view; - thrust::fill(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - value); - RAFT_CHECK_CUDA(stream_view.value()); -} - -template -void scatter(size_t n, T* src, T* dst, M* map) -{ - rmm::cuda_stream_view stream_view; - thrust::scatter(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(src), - thrust::device_pointer_cast(src + n), - thrust::device_pointer_cast(map), - thrust::device_pointer_cast(dst)); - RAFT_CHECK_CUDA(stream_view.value()); -} - -template -void printv(size_t n, T* vec, int offset) -{ - thrust::device_ptr dev_ptr(vec); - std::cout.precision(15); - std::cout << "sample size = " << n << ", offset = " << offset << std::endl; - thrust::copy( - dev_ptr + offset, - dev_ptr + offset + n, - 
std::ostream_iterator( - std::cout, " ")); // Assume no RMM dependency; TODO: check / test (potential BUG !!!!!) - RAFT_CHECK_CUDA(nullptr); - std::cout << std::endl; -} - -template -void copy(size_t n, T* x, T* res) -{ - thrust::device_ptr dev_ptr(x); - thrust::device_ptr res_ptr(res); - rmm::cuda_stream_view stream_view; - thrust::copy_n(rmm::exec_policy(stream_view), dev_ptr, n, res_ptr); - RAFT_CHECK_CUDA(stream_view.value()); -} - -template -struct is_zero { - __host__ __device__ bool operator()(const T x) { return x == 0; } -}; - -template -struct dangling_functor : public thrust::unary_function { - const T val; - dangling_functor(T _val) : val(_val) {} - __host__ __device__ T operator()(const T& x) const { return val + x; } -}; - -template -void update_dangling_nodes(size_t n, T* dangling_nodes, T damping_factor) -{ - rmm::cuda_stream_view stream_view; - thrust::transform_if(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(dangling_nodes), - thrust::device_pointer_cast(dangling_nodes + n), - thrust::device_pointer_cast(dangling_nodes), - dangling_functor(1.0 - damping_factor), - is_zero()); - RAFT_CHECK_CUDA(stream_view.value()); -} - -// google matrix kernels -template -__global__ static void degree_coo(const IndexType n, - const IndexType e, - const IndexType* ind, - ValueType* degree) -{ - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x) - atomicAdd(°ree[ind[i]], (ValueType)1.0); -} - -template -__global__ static void flag_leafs_kernel(const size_t n, - const IndexType* degree, - ValueType* bookmark) -{ - for (auto i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) - if (degree[i] == 0) bookmark[i] = 1.0; -} - -template -__global__ static void degree_offsets(const IndexType n, - const IndexType e, - const IndexType* ind, - ValueType* degree) -{ - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) - degree[i] += ind[i + 1] - ind[i]; -} - -template -__global__ static void type_convert(FromType* array, int n) -{ - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { - ToType val = array[i]; - ToType* vals = (ToType*)array; - vals[i] = val; - } -} - -template -__global__ static void equi_prob3(const IndexType n, - const IndexType e, - const IndexType* csrPtr, - const IndexType* csrInd, - ValueType* val, - IndexType* degree) -{ - int j, row, col; - for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) { - for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; j < csrPtr[row + 1]; - j += gridDim.y * blockDim.y) { - col = csrInd[j]; - val[j] = 1.0 / degree[col]; - // val[j] = 999; - } - } -} - -template -__global__ static void equi_prob2(const IndexType n, - const IndexType e, - const IndexType* csrPtr, - const IndexType* csrInd, - ValueType* val, - IndexType* degree) -{ - int row = blockIdx.x * blockDim.x + threadIdx.x; - if (row < n) { - int row_begin = csrPtr[row]; - int row_end = csrPtr[row + 1]; - int col; - for (int i = row_begin; i < row_end; i++) { - col = csrInd[i]; - val[i] = 1.0 / degree[col]; - } - } -} - -// compute the H^T values for an already transposed adjacency matrix, leveraging coo info -template -void HT_matrix_csc_coo(const IndexType n, - const IndexType e, - const IndexType* csrPtr, - const IndexType* csrInd, - ValueType* val, - ValueType* bookmark) -{ - rmm::cuda_stream_view stream_view; - rmm::device_uvector degree(n, stream_view); - - dim3 nthreads, nblocks; - 
nthreads.x = min(e, CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - degree_coo - <<>>(n, e, csrInd, degree.data()); - RAFT_CHECK_CUDA(stream_view.value()); - - int y = 4; - nthreads.x = 32 / y; - nthreads.y = y; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, CUDA_MAX_BLOCKS); // 1; - equi_prob3 - <<>>(n, e, csrPtr, csrInd, val, degree.data()); - RAFT_CHECK_CUDA(stream_view.value()); - - ValueType a = 0.0; - fill(n, bookmark, a); - RAFT_CHECK_CUDA(stream_view.value()); - - nthreads.x = min(n, CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((n + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - flag_leafs_kernel - <<>>(n, degree.data(), bookmark); - RAFT_CHECK_CUDA(stream_view.value()); -} - -template -__attribute__((visibility("hidden"))) __global__ void offsets_to_indices_kernel( - const offsets_t* offsets, index_t v, index_t* indices) -{ - auto tid{threadIdx.x}; - auto ctaStart{blockIdx.x}; - - for (index_t j = ctaStart; j < v; j += gridDim.x) { - offsets_t colStart = offsets[j]; - offsets_t colEnd = offsets[j + 1]; - offsets_t rowNnz = colEnd - colStart; - - for (offsets_t i = 0; i < rowNnz; i += blockDim.x) { - if ((colStart + tid + i) < colEnd) { indices[colStart + tid + i] = j; } - } - } -} - -template -void offsets_to_indices(const offsets_t* offsets, index_t v, index_t* indices) -{ - cudaStream_t stream{nullptr}; - index_t nthreads = min(v, (index_t)CUDA_MAX_KERNEL_THREADS); - index_t nblocks = min((v + nthreads - 1) / nthreads, (index_t)CUDA_MAX_BLOCKS); - offsets_to_indices_kernel<<>>(offsets, v, indices); - RAFT_CHECK_CUDA(stream); -} - -template -void sequence(IndexType n, IndexType* vec, IndexType init = 0) -{ - thrust::sequence( - thrust::device, thrust::device_pointer_cast(vec), thrust::device_pointer_cast(vec + n), init); - RAFT_CHECK_CUDA(nullptr); -} - -template -bool has_negative_val(DistType* arr, size_t n) -{ - // custom kernel with boolean bitwise reduce may be - // faster. 
- rmm::cuda_stream_view stream_view; - DistType result = *thrust::min_element(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(arr), - thrust::device_pointer_cast(arr + n)); - - RAFT_CHECK_CUDA(stream_view.value()); - - return (result < 0); -} - -} // namespace detail -} // namespace cugraph - -#include "eidecl_graph_utils.hpp" diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 52d257b9bea..da31f498de1 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -39,6 +39,7 @@ add_library(cugraphtestutil STATIC utilities/misc_utilities.cpp utilities/conversion_utilities_sg.cu utilities/debug_utilities_sg.cpp + utilities/validation_utilities.cu link_prediction/similarity_compare.cpp centrality/betweenness_centrality_validate.cu community/egonet_validate.cu @@ -487,6 +488,10 @@ ConfigureTest(BIASED_NEIGHBOR_SAMPLING_TEST sampling/biased_neighbor_sampling.cp # - SAMPLING_POST_PROCESSING tests ---------------------------------------------------------------- ConfigureTest(SAMPLING_POST_PROCESSING_TEST sampling/sampling_post_processing_test.cu) +################################################################################################### +# - NEGATIVE SAMPLING tests -------------------------------------------------------------------- +ConfigureTest(NEGATIVE_SAMPLING_TEST sampling/negative_sampling.cpp) + ################################################################################################### # - Renumber tests -------------------------------------------------------------------------------- ConfigureTest(RENUMBERING_TEST structure/renumbering_test.cpp) @@ -741,6 +746,11 @@ if(BUILD_CUGRAPH_MG_TESTS) # - MG BIASED NBR SAMPLING tests -------------------------------------------------------------- ConfigureTestMG(MG_BIASED_NEIGHBOR_SAMPLING_TEST sampling/mg_biased_neighbor_sampling.cpp) + ################################################################################################### + # - NEGATIVE SAMPLING tests -------------------------------------------------------------------- + ConfigureTestMG(MG_NEGATIVE_SAMPLING_TEST sampling/mg_negative_sampling.cpp) + + ############################################################################################### # - MG RANDOM_WALKS tests --------------------------------------------------------------------- ConfigureTestMG(MG_RANDOM_WALKS_TEST sampling/mg_random_walks_test.cpp) @@ -773,6 +783,7 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureCTestMG(MG_CAPI_HITS_TEST c_api/mg_hits_test.c) ConfigureCTestMG(MG_CAPI_UNIFORM_NEIGHBOR_SAMPLE_TEST c_api/mg_uniform_neighbor_sample_test.c) ConfigureCTestMG(MG_CAPI_BIASED_NEIGHBOR_SAMPLE_TEST c_api/mg_biased_neighbor_sample_test.c) + ConfigureCTestMG(MG_CAPI_NEGATIVE_SAMPLING_TEST c_api/mg_negative_sampling_test.c) ConfigureCTestMG(MG_CAPI_LOOKUP_SRC_DST_TEST c_api/mg_lookup_src_dst_test.c) ConfigureCTestMG(MG_CAPI_RANDOM_WALKS_TEST c_api/mg_random_walks_test.c) ConfigureCTestMG(MG_CAPI_TRIANGLE_COUNT_TEST c_api/mg_triangle_count_test.c) @@ -812,6 +823,7 @@ ConfigureCTest(CAPI_WEAKLY_CONNECTED_COMPONENTS_TEST c_api/weakly_connected_comp ConfigureCTest(CAPI_STRONGLY_CONNECTED_COMPONENTS_TEST c_api/strongly_connected_components_test.c) ConfigureCTest(CAPI_UNIFORM_NEIGHBOR_SAMPLE_TEST c_api/uniform_neighbor_sample_test.c) ConfigureCTest(CAPI_BIASED_NEIGHBOR_SAMPLE_TEST c_api/biased_neighbor_sample_test.c) +ConfigureCTest(CAPI_NEGATIVE_SAMPLING_TEST c_api/negative_sampling_test.c) ConfigureCTest(CAPI_RANDOM_WALKS_TEST c_api/sg_random_walks_test.c) 
ConfigureCTest(CAPI_TRIANGLE_COUNT_TEST c_api/triangle_count_test.c) ConfigureCTest(CAPI_LOUVAIN_TEST c_api/louvain_test.c) diff --git a/cpp/tests/c_api/mg_negative_sampling_test.c b/cpp/tests/c_api/mg_negative_sampling_test.c new file mode 100644 index 00000000000..3289206d8db --- /dev/null +++ b/cpp/tests/c_api/mg_negative_sampling_test.c @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mg_test_utils.h" /* RUN_MG_TEST */ + +#include +#include + +#include +#include +#include + +typedef int32_t vertex_t; +typedef int32_t edge_t; +typedef float weight_t; + +data_type_id_t vertex_tid = INT32; +data_type_id_t edge_tid = INT32; +data_type_id_t weight_tid = FLOAT32; +data_type_id_t edge_id_tid = INT32; +data_type_id_t edge_type_tid = INT32; + +int generic_negative_sampling_test(const cugraph_resource_handle_t* handle, + vertex_t* h_src, + vertex_t* h_dst, + size_t num_vertices, + size_t num_edges, + size_t num_samples, + vertex_t* h_vertices, + weight_t* h_src_bias, + weight_t* h_dst_bias, + size_t num_biases, + bool_t remove_duplicates, + bool_t remove_false_negatives, + bool_t exact_number_of_samples) +{ + // Create graph + int test_ret_value = 0; + cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; + cugraph_error_t* ret_error = NULL; + cugraph_graph_t* graph = NULL; + cugraph_coo_t* result = NULL; + + ret_code = create_mg_test_graph_new(handle, + vertex_tid, + edge_tid, + h_src, + h_dst, + weight_tid, + NULL, + edge_type_tid, + NULL, + edge_id_tid, + NULL, + num_edges, + FALSE, + TRUE, + FALSE, + FALSE, + &graph, + &ret_error); + + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); + + cugraph_type_erased_device_array_t* d_vertices = NULL; + cugraph_type_erased_device_array_view_t* d_vertices_view = NULL; + cugraph_type_erased_device_array_t* d_src_bias = NULL; + cugraph_type_erased_device_array_view_t* d_src_bias_view = NULL; + cugraph_type_erased_device_array_t* d_dst_bias = NULL; + cugraph_type_erased_device_array_view_t* d_dst_bias_view = NULL; + + int rank = cugraph_resource_handle_get_rank(handle); + + if (num_biases > 0) { + if (rank == 0) { + ret_code = cugraph_type_erased_device_array_create( + handle, num_biases, vertex_tid, &d_vertices, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_vertices create failed."); + + d_vertices_view = cugraph_type_erased_device_array_view(d_vertices); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_vertices_view, (byte_t*)h_vertices, &ret_error); + + ret_code = cugraph_type_erased_device_array_create( + handle, num_biases, weight_tid, &d_src_bias, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_src_bias create failed."); + + d_src_bias_view = cugraph_type_erased_device_array_view(d_src_bias); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_src_bias_view, (byte_t*)h_src_bias, &ret_error); + + ret_code = 
cugraph_type_erased_device_array_create( + handle, num_biases, weight_tid, &d_dst_bias, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_dst_bias create failed."); + + d_dst_bias_view = cugraph_type_erased_device_array_view(d_dst_bias); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_dst_bias_view, (byte_t*)h_dst_bias, &ret_error); + } else { + d_vertices_view = cugraph_type_erased_device_array_view_create(NULL, 0, vertex_tid); + d_src_bias_view = cugraph_type_erased_device_array_view_create(NULL, 0, weight_tid); + d_dst_bias_view = cugraph_type_erased_device_array_view_create(NULL, 0, weight_tid); + } + } + + cugraph_rng_state_t* rng_state; + ret_code = cugraph_rng_state_create(handle, rank, &rng_state, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); + + ret_code = cugraph_negative_sampling(handle, + rng_state, + graph, + d_vertices_view, + d_src_bias_view, + d_dst_bias_view, + num_samples, + remove_duplicates, + remove_false_negatives, + exact_number_of_samples, + FALSE, + &result, + &ret_error); + + cugraph_type_erased_device_array_view_t* result_srcs = NULL; + cugraph_type_erased_device_array_view_t* result_dsts = NULL; + + result_srcs = cugraph_coo_get_sources(result); + result_dsts = cugraph_coo_get_destinations(result); + + size_t result_size = cugraph_type_erased_device_array_view_size(result_srcs); + + vertex_t h_result_srcs[result_size]; + vertex_t h_result_dsts[result_size]; + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_result_srcs, result_srcs, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_result_dsts, result_dsts, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + // Build an adjacency matrix of the input graph so the generated samples can be validated + int32_t M_exists[num_vertices][num_vertices]; + int32_t M_duplicates[num_vertices][num_vertices]; + + for (int i = 0; i < num_vertices; ++i) + for (int j = 0; j < num_vertices; ++j) { + M_exists[i][j] = 0; + M_duplicates[i][j] = 0; + } + + for (int i = 0; i < num_edges; ++i) { + M_exists[h_src[i]][h_dst[i]] = 1; + } + + for (int i = 0; (i < result_size) && (test_ret_value == 0); ++i) { + TEST_ASSERT(test_ret_value, + (h_result_srcs[i] >= 0) && (h_result_srcs[i] < num_vertices), + "negative_sampling generated an edge with an invalid vertex"); + TEST_ASSERT(test_ret_value, + (h_result_dsts[i] >= 0) && (h_result_dsts[i] < num_vertices), + "negative_sampling generated an edge with an invalid vertex"); + if (remove_false_negatives == TRUE) { + TEST_ASSERT(test_ret_value, + M_exists[h_result_srcs[i]][h_result_dsts[i]] == 0, + "negative_sampling generated a false negative edge that should be suppressed"); + } + + if (remove_duplicates == TRUE) { + TEST_ASSERT(test_ret_value, + M_duplicates[h_result_srcs[i]][h_result_dsts[i]] == 0, + "negative_sampling generated a duplicate edge that should be suppressed"); + M_duplicates[h_result_srcs[i]][h_result_dsts[i]] = 1; + } + } + + if (exact_number_of_samples == TRUE) + TEST_ASSERT(test_ret_value, + result_size == num_samples, + "negative_sampling generated a result with an incorrect number of samples"); + + cugraph_type_erased_device_array_view_free(d_vertices_view); + cugraph_type_erased_device_array_view_free(d_src_bias_view); +
cugraph_type_erased_device_array_view_free(d_dst_bias_view); + cugraph_type_erased_device_array_free(d_vertices); + cugraph_type_erased_device_array_free(d_src_bias); + cugraph_type_erased_device_array_free(d_dst_bias); + cugraph_coo_free(result); + cugraph_mg_graph_free(graph); + cugraph_error_free(ret_error); + return test_ret_value; +} + +int test_negative_sampling_uniform(const cugraph_resource_handle_t* handle) +{ + data_type_id_t vertex_tid = INT32; + data_type_id_t edge_tid = INT32; + data_type_id_t weight_tid = FLOAT32; + + size_t num_edges = 9; + size_t num_vertices = 6; + size_t num_biases = 0; + size_t num_samples = 10; + + vertex_t src[] = {0, 0, 1, 1, 2, 2, 2, 3, 4}; + vertex_t dst[] = {1, 2, 3, 4, 0, 1, 3, 5, 5}; + + bool_t remove_duplicates = FALSE; + bool_t remove_false_negatives = TRUE; + bool_t exact_number_of_samples = FALSE; + + return generic_negative_sampling_test(handle, + src, + dst, + num_vertices, + num_edges, + num_samples, + NULL, + NULL, + NULL, + num_biases, + remove_duplicates, + remove_false_negatives, + exact_number_of_samples); +} + +int test_negative_sampling_biased(const cugraph_resource_handle_t* handle) +{ + data_type_id_t vertex_tid = INT32; + data_type_id_t edge_tid = INT32; + data_type_id_t weight_tid = FLOAT32; + + size_t num_edges = 9; + size_t num_vertices = 6; + size_t num_biases = 6; + size_t num_samples = 10; + + vertex_t src[] = {0, 0, 1, 1, 2, 2, 2, 3, 4}; + vertex_t dst[] = {1, 2, 3, 4, 0, 1, 3, 5, 5}; + weight_t src_bias[] = {1, 1, 2, 2, 1, 1}; + weight_t dst_bias[] = {2, 2, 1, 1, 1, 1}; + vertex_t vertices[] = {0, 1, 2, 3, 4, 5}; + + bool_t remove_duplicates = FALSE; + bool_t remove_false_negatives = TRUE; + bool_t exact_number_of_samples = FALSE; + + return generic_negative_sampling_test(handle, + src, + dst, + num_vertices, + num_edges, + num_samples, + vertices, + src_bias, + dst_bias, + num_biases, + remove_duplicates, + remove_false_negatives, + exact_number_of_samples); +} + +/******************************************************************************/ + +int main(int argc, char** argv) +{ + void* raft_handle = create_mg_raft_handle(argc, argv); + cugraph_resource_handle_t* handle = cugraph_create_resource_handle(raft_handle); + + int result = 0; + result |= RUN_MG_TEST(test_negative_sampling_uniform, handle); + result |= RUN_MG_TEST(test_negative_sampling_biased, handle); + + cugraph_free_resource_handle(handle); + free_mg_raft_handle(raft_handle); + + return result; +} diff --git a/cpp/tests/c_api/negative_sampling_test.c b/cpp/tests/c_api/negative_sampling_test.c new file mode 100644 index 00000000000..5e8d3f7e765 --- /dev/null +++ b/cpp/tests/c_api/negative_sampling_test.c @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c_test_utils.h" /* RUN_TEST */ + +#include +#include + +#include +#include +#include + +typedef int32_t vertex_t; +typedef int32_t edge_t; +typedef float weight_t; + +data_type_id_t vertex_tid = INT32; +data_type_id_t edge_tid = INT32; +data_type_id_t weight_tid = FLOAT32; +data_type_id_t edge_id_tid = INT32; +data_type_id_t edge_type_tid = INT32; + +int generic_negative_sampling_test(const cugraph_resource_handle_t* handle, + vertex_t* h_src, + vertex_t* h_dst, + size_t num_vertices, + size_t num_edges, + size_t num_samples, + vertex_t* h_vertices, + weight_t* h_src_bias, + weight_t* h_dst_bias, + size_t num_biases, + bool_t remove_duplicates, + bool_t remove_false_negatives, + bool_t exact_number_of_samples) +{ + // Create graph + int test_ret_value = 0; + cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; + cugraph_error_t* ret_error = NULL; + cugraph_graph_t* graph = NULL; + cugraph_coo_t* result = NULL; + + ret_code = create_sg_test_graph(handle, + vertex_tid, + edge_tid, + h_src, + h_dst, + weight_tid, + NULL, + edge_type_tid, + NULL, + edge_id_tid, + NULL, + num_edges, + FALSE, + TRUE, + FALSE, + FALSE, + &graph, + &ret_error); + + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); + + cugraph_type_erased_device_array_t* d_vertices = NULL; + cugraph_type_erased_device_array_view_t* d_vertices_view = NULL; + cugraph_type_erased_device_array_t* d_src_bias = NULL; + cugraph_type_erased_device_array_view_t* d_src_bias_view = NULL; + cugraph_type_erased_device_array_t* d_dst_bias = NULL; + cugraph_type_erased_device_array_view_t* d_dst_bias_view = NULL; + + if (num_biases > 0) { + ret_code = cugraph_type_erased_device_array_create( + handle, num_biases, vertex_tid, &d_vertices, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_vertices create failed."); + + d_vertices_view = cugraph_type_erased_device_array_view(d_vertices); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_vertices_view, (byte_t*)h_vertices, &ret_error); + + ret_code = cugraph_type_erased_device_array_create( + handle, num_biases, weight_tid, &d_src_bias, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_src_bias create failed."); + + d_src_bias_view = cugraph_type_erased_device_array_view(d_src_bias); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_src_bias_view, (byte_t*)h_src_bias, &ret_error); + + ret_code = cugraph_type_erased_device_array_create( + handle, num_biases, weight_tid, &d_dst_bias, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_dst_bias create failed."); + + d_dst_bias_view = cugraph_type_erased_device_array_view(d_dst_bias); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_dst_bias_view, (byte_t*)h_dst_bias, &ret_error); + } + + cugraph_rng_state_t* rng_state; + ret_code = cugraph_rng_state_create(handle, 0, &rng_state, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); + + ret_code = cugraph_negative_sampling(handle, + rng_state, + graph, + d_vertices_view, + d_src_bias_view, + d_dst_bias_view, + num_samples, + remove_duplicates, + remove_false_negatives, + exact_number_of_samples, + FALSE, + &result, + &ret_error); + + cugraph_type_erased_device_array_view_t* result_srcs = NULL; + cugraph_type_erased_device_array_view_t* result_dsts = NULL; + + result_srcs = cugraph_coo_get_sources(result); + result_dsts = 
cugraph_coo_get_destinations(result); + + size_t result_size = cugraph_type_erased_device_array_view_size(result_srcs); + + vertex_t h_result_srcs[result_size]; + vertex_t h_result_dsts[result_size]; + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_result_srcs, result_srcs, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_result_dsts, result_dsts, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + // Build an adjacency matrix of the input graph so the generated samples can be validated + int32_t M_exists[num_vertices][num_vertices]; + int32_t M_duplicates[num_vertices][num_vertices]; + + for (int i = 0; i < num_vertices; ++i) + for (int j = 0; j < num_vertices; ++j) { + M_exists[i][j] = 0; + M_duplicates[i][j] = 0; + } + + for (int i = 0; i < num_edges; ++i) { + M_exists[h_src[i]][h_dst[i]] = 1; + } + + for (int i = 0; (i < result_size) && (test_ret_value == 0); ++i) { + TEST_ASSERT(test_ret_value, + (h_result_srcs[i] >= 0) && (h_result_srcs[i] < num_vertices), + "negative_sampling generated an edge with an invalid vertex"); + TEST_ASSERT(test_ret_value, + (h_result_dsts[i] >= 0) && (h_result_dsts[i] < num_vertices), + "negative_sampling generated an edge with an invalid vertex"); + if (remove_false_negatives == TRUE) { + TEST_ASSERT(test_ret_value, + M_exists[h_result_srcs[i]][h_result_dsts[i]] == 0, + "negative_sampling generated a false negative edge that should be suppressed"); + } + + if (remove_duplicates == TRUE) { + TEST_ASSERT(test_ret_value, + M_duplicates[h_result_srcs[i]][h_result_dsts[i]] == 0, + "negative_sampling generated a duplicate edge that should be suppressed"); + M_duplicates[h_result_srcs[i]][h_result_dsts[i]] = 1; + } + } + + if (exact_number_of_samples == TRUE) + TEST_ASSERT(test_ret_value, + result_size == num_samples, + "negative_sampling generated a result with an incorrect number of samples"); + + cugraph_type_erased_device_array_view_free(d_vertices_view); + cugraph_type_erased_device_array_view_free(d_src_bias_view); + cugraph_type_erased_device_array_view_free(d_dst_bias_view); + cugraph_type_erased_device_array_free(d_vertices); + cugraph_type_erased_device_array_free(d_src_bias); + cugraph_type_erased_device_array_free(d_dst_bias); + cugraph_coo_free(result); + cugraph_sg_graph_free(graph); + cugraph_error_free(ret_error); + return test_ret_value; +} + +int test_negative_sampling_uniform(const cugraph_resource_handle_t* handle) +{ + data_type_id_t vertex_tid = INT32; + data_type_id_t edge_tid = INT32; + data_type_id_t weight_tid = FLOAT32; + + size_t num_edges = 9; + size_t num_vertices = 6; + size_t num_biases = 0; + size_t num_samples = 10; + + vertex_t src[] = {0, 0, 1, 1, 2, 2, 2, 3, 4}; + vertex_t dst[] = {1, 2, 3, 4, 0, 1, 3, 5, 5}; + + bool_t remove_duplicates = FALSE; + bool_t remove_false_negatives = TRUE; + bool_t exact_number_of_samples = FALSE; + + return generic_negative_sampling_test(handle, + src, + dst, + num_vertices, + num_edges, + num_samples, + NULL, + NULL, + NULL, + num_biases, + remove_duplicates, + remove_false_negatives, + exact_number_of_samples); +} + +int test_negative_sampling_biased(const cugraph_resource_handle_t* handle) +{ + data_type_id_t vertex_tid = INT32; + data_type_id_t edge_tid = INT32; + data_type_id_t weight_tid = FLOAT32; + + size_t num_edges = 9; + size_t num_vertices = 6; + size_t num_biases = 6; + size_t num_samples = 10; + + vertex_t src[] = {0, 0, 1, 1, 2,
2, 2, 3, 4}; + vertex_t dst[] = {1, 2, 3, 4, 0, 1, 3, 5, 5}; + weight_t src_bias[] = {1, 1, 2, 2, 1, 1}; + weight_t dst_bias[] = {2, 2, 1, 1, 1, 1}; + vertex_t vertices[] = {0, 1, 2, 3, 4, 5}; + + bool_t remove_duplicates = FALSE; + bool_t remove_false_negatives = TRUE; + bool_t exact_number_of_samples = FALSE; + + return generic_negative_sampling_test(handle, + src, + dst, + num_vertices, + num_edges, + num_samples, + vertices, + src_bias, + dst_bias, + num_biases, + remove_duplicates, + remove_false_negatives, + exact_number_of_samples); +} + +int main(int argc, char** argv) +{ + cugraph_resource_handle_t* handle = NULL; + + handle = cugraph_create_resource_handle(NULL); + + int result = 0; + result |= RUN_TEST_NEW(test_negative_sampling_uniform, handle); + result |= RUN_TEST_NEW(test_negative_sampling_biased, handle); + + cugraph_free_resource_handle(handle); + + return result; +} diff --git a/cpp/tests/sampling/detail/nbr_sampling_validate.cu b/cpp/tests/sampling/detail/nbr_sampling_validate.cu index 61731e2e15c..70828e559f1 100644 --- a/cpp/tests/sampling/detail/nbr_sampling_validate.cu +++ b/cpp/tests/sampling/detail/nbr_sampling_validate.cu @@ -75,6 +75,8 @@ struct ArithmeticZipLess { } else { return thrust::get<1>(left) < thrust::get<1>(right); } + } else { + return false; } } }; diff --git a/cpp/tests/sampling/mg_negative_sampling.cpp b/cpp/tests/sampling/mg_negative_sampling.cpp new file mode 100644 index 00000000000..7c64bb7fbbb --- /dev/null +++ b/cpp/tests/sampling/mg_negative_sampling.cpp @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "utilities/base_fixture.hpp" +#include "utilities/conversion_utilities.hpp" +#include "utilities/property_generator_utilities.hpp" +#include "utilities/validation_utilities.hpp" + +#include +#include + +#include + +struct Negative_Sampling_Usecase { + float sample_multiplier{2}; + bool use_src_bias{false}; + bool use_dst_bias{false}; + bool remove_duplicates{false}; + bool remove_existing_edges{false}; + bool exact_number_of_samples{false}; + bool edge_masking{false}; + bool check_correctness{true}; +}; + +template +class Tests_MGNegative_Sampling : public ::testing::TestWithParam { + public: + using graph_t = cugraph::graph_t; + using graph_view_t = cugraph::graph_view_t; + + Tests_MGNegative_Sampling() : graph_(*handle_) {} + + static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); } + + static void TearDownTestCase() { handle_.reset(); } + + template + void load_graph(input_t const& param) + { + HighResTimer hr_timer{}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Construct graph"); + } + + std::tie(graph_, edge_weights_, renumber_map_labels_) = + cugraph::test::construct_graph( + *handle_, param, true, true); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + edge_mask_ = + cugraph::test::generate::edge_property(*handle_, graph_.view(), 2); + } + + virtual void SetUp() {} + virtual void TearDown() {} + + void run_current_test(raft::random::RngState& rng_state, + Negative_Sampling_Usecase const& negative_sampling_usecase) + { + constexpr bool do_expensive_check{false}; + + HighResTimer hr_timer{}; + + auto graph_view = graph_.view(); + + if (negative_sampling_usecase.edge_masking) { graph_view.attach_edge_mask(edge_mask_->view()); } + + size_t num_samples = + graph_view.compute_number_of_edges(*handle_) * negative_sampling_usecase.sample_multiplier; + + rmm::device_uvector src_bias_v(0, handle_->get_stream()); + rmm::device_uvector dst_bias_v(0, handle_->get_stream()); + + std::optional> src_bias{std::nullopt}; + std::optional> dst_bias{std::nullopt}; + + if (negative_sampling_usecase.use_src_bias) { + src_bias_v.resize(graph_view.local_vertex_partition_range_size(), handle_->get_stream()); + + cugraph::detail::uniform_random_fill(handle_->get_stream(), + src_bias_v.data(), + src_bias_v.size(), + weight_t{1}, + weight_t{10}, + rng_state); + + src_bias = raft::device_span{src_bias_v.data(), src_bias_v.size()}; + } + + if (negative_sampling_usecase.use_dst_bias) { + dst_bias_v.resize(graph_view.local_vertex_partition_range_size(), handle_->get_stream()); + + cugraph::detail::uniform_random_fill(handle_->get_stream(), + dst_bias_v.data(), + dst_bias_v.size(), + weight_t{1}, + weight_t{10}, + rng_state); + + dst_bias = raft::device_span{dst_bias_v.data(), dst_bias_v.size()}; + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Negative sampling"); + } + + auto&& [src_out, dst_out] = + cugraph::negative_sampling(*handle_, + rng_state, + graph_view, + src_bias, + dst_bias, + num_samples, + negative_sampling_usecase.remove_duplicates, + negative_sampling_usecase.remove_existing_edges, + negative_sampling_usecase.exact_number_of_samples, + do_expensive_check); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for 
consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (negative_sampling_usecase.check_correctness) { + ASSERT_EQ(src_out.size(), dst_out.size()) << "Result size (src, dst) mismatch"; + + cugraph::test::sort(*handle_, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}); + + // TODO: Move this to validation_utilities... + auto h_vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts(); + rmm::device_uvector d_vertex_partition_range_lasts( + h_vertex_partition_range_lasts.size(), handle_->get_stream()); + raft::update_device(d_vertex_partition_range_lasts.data(), + h_vertex_partition_range_lasts.data(), + h_vertex_partition_range_lasts.size(), + handle_->get_stream()); + + size_t error_count = cugraph::test::count_edges_on_wrong_int_gpu( + *handle_, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}, + raft::device_span{d_vertex_partition_range_lasts.data(), + d_vertex_partition_range_lasts.size()}); + + ASSERT_EQ(error_count, 0) << "edges generated on the wrong GPU > 0"; + + if ((negative_sampling_usecase.remove_duplicates) && (src_out.size() > 0)) { + error_count = cugraph::test::count_duplicate_vertex_pairs_sorted( + *handle_, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}); + ASSERT_EQ(error_count, 0) << "Remove duplicates specified, found duplicate entries"; + } + + if (negative_sampling_usecase.remove_existing_edges) { + rmm::device_uvector graph_src(0, handle_->get_stream()); + rmm::device_uvector graph_dst(0, handle_->get_stream()); + + std::tie(graph_src, graph_dst, std::ignore, std::ignore, std::ignore) = + cugraph::decompress_to_edgelist( + *handle_, graph_view, std::nullopt, std::nullopt, std::nullopt, std::nullopt); + + error_count = cugraph::test::count_intersection( + *handle_, + raft::device_span{graph_src.data(), graph_src.size()}, + raft::device_span{graph_dst.data(), graph_dst.size()}, + std::nullopt, + std::nullopt, + std::nullopt, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}, + std::nullopt, + std::nullopt, + std::nullopt); + ASSERT_EQ(error_count, 0) << "Remove existing edges specified, found existing edges"; + } + + if (negative_sampling_usecase.exact_number_of_samples) { + size_t sz = cugraph::host_scalar_allreduce( + handle_->get_comms(), src_out.size(), raft::comms::op_t::SUM, handle_->get_stream()); + ASSERT_EQ(sz, num_samples) << "Expected exact number of samples"; + } + + // TBD: How do we determine if we have properly reflected the biases?
+ } + } + + public: + static std::unique_ptr handle_; + + private: + graph_t graph_; + std::optional> edge_weights_{std::nullopt}; + std::optional> edge_mask_{std::nullopt}; + std::optional> renumber_map_labels_{std::nullopt}; +}; + +template +std::unique_ptr + Tests_MGNegative_Sampling::handle_ = nullptr; + +using Tests_MGNegative_Sampling_File_i64_i64_float = + Tests_MGNegative_Sampling; + +using Tests_MGNegative_Sampling_Rmat_i64_i64_float = + Tests_MGNegative_Sampling; + +template +void run_all_tests(CurrentTest* current_test) +{ + raft::random::RngState rng_state{ + static_cast(current_test->handle_->get_comms().get_rank())}; + + for (bool use_src_bias : {false, true}) + for (bool use_dst_bias : {false, true}) + for (bool remove_duplicates : {false, true}) + for (bool remove_existing_edges : {false, true}) + for (bool exact_number_of_samples : {false, true}) + for (bool edge_masking : {false, true}) + current_test->run_current_test(rng_state, + Negative_Sampling_Usecase{2, + use_src_bias, + use_dst_bias, + remove_duplicates, + remove_existing_edges, + exact_number_of_samples, + edge_masking}); +} + +TEST_P(Tests_MGNegative_Sampling_File_i64_i64_float, CheckInt64Int64Float) +{ + load_graph(override_File_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +TEST_P(Tests_MGNegative_Sampling_Rmat_i64_i64_float, CheckInt64Int64Float) +{ + load_graph(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_MGNegative_Sampling_File_i64_i64_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + file_large_test, + Tests_MGNegative_Sampling_File_i64_i64_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_MGNegative_Sampling_Rmat_i64_i64_float, + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_MGNegative_Sampling_Rmat_i64_i64_float, + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false, 0))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/sampling/negative_sampling.cpp b/cpp/tests/sampling/negative_sampling.cpp new file mode 100644 index 00000000000..ba929c63e9b --- /dev/null +++ b/cpp/tests/sampling/negative_sampling.cpp @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utilities/base_fixture.hpp" +#include "utilities/conversion_utilities.hpp" +#include "utilities/property_generator_utilities.hpp" +#include "utilities/validation_utilities.hpp" + +#include +#include + +#include + +struct Negative_Sampling_Usecase { + float sample_multiplier{2}; + bool use_src_bias{false}; + bool use_dst_bias{false}; + bool remove_duplicates{false}; + bool remove_existing_edges{false}; + bool exact_number_of_samples{false}; + bool edge_masking{false}; + bool check_correctness{true}; +}; + +template +class Tests_Negative_Sampling : public ::testing::TestWithParam { + public: + using graph_t = cugraph::graph_t; + using graph_view_t = cugraph::graph_view_t; + + Tests_Negative_Sampling() : graph_(raft::handle_t{}) {} + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + template + void load_graph(input_t const& param) + { + raft::handle_t handle{}; + HighResTimer hr_timer{}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Construct graph"); + } + + std::tie(graph_, edge_weights_, renumber_map_labels_) = + cugraph::test::construct_graph( + handle, param, true, true); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + edge_mask_ = + cugraph::test::generate::edge_property(handle, graph_.view(), 2); + } + + virtual void SetUp() {} + virtual void TearDown() {} + + void run_current_test(raft::random::RngState& rng_state, + Negative_Sampling_Usecase const& negative_sampling_usecase) + { + constexpr bool do_expensive_check{false}; + + raft::handle_t handle{}; + HighResTimer hr_timer{}; + + auto graph_view = graph_.view(); + + if (negative_sampling_usecase.edge_masking) { graph_view.attach_edge_mask(edge_mask_->view()); } + + size_t num_samples = + graph_view.compute_number_of_edges(handle) * negative_sampling_usecase.sample_multiplier; + + rmm::device_uvector src_bias_v(0, handle.get_stream()); + rmm::device_uvector dst_bias_v(0, handle.get_stream()); + + std::optional> src_bias{std::nullopt}; + std::optional> dst_bias{std::nullopt}; + + if (negative_sampling_usecase.use_src_bias) { + src_bias_v.resize(graph_view.number_of_vertices(), handle.get_stream()); + + cugraph::detail::uniform_random_fill(handle.get_stream(), + src_bias_v.data(), + src_bias_v.size(), + weight_t{1}, + weight_t{10}, + rng_state); + + src_bias = raft::device_span{src_bias_v.data(), src_bias_v.size()}; + } + + if (negative_sampling_usecase.use_dst_bias) { + dst_bias_v.resize(graph_view.number_of_vertices(), handle.get_stream()); + + cugraph::detail::uniform_random_fill(handle.get_stream(), + dst_bias_v.data(), + dst_bias_v.size(), + weight_t{1}, + weight_t{10}, + rng_state); + + dst_bias = raft::device_span{dst_bias_v.data(), dst_bias_v.size()}; + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Negative sampling"); + } + + auto&& [src_out, dst_out] = + cugraph::negative_sampling(handle, + rng_state, + graph_view, + src_bias, + dst_bias, + num_samples, + negative_sampling_usecase.remove_duplicates, + negative_sampling_usecase.remove_existing_edges, + negative_sampling_usecase.exact_number_of_samples, + do_expensive_check); + + if (cugraph::test::g_perf) { + 
RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (negative_sampling_usecase.check_correctness) { + ASSERT_EQ(src_out.size(), dst_out.size()) << "Result size (src, dst) mismatch"; + + cugraph::test::sort(handle, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}); + + size_t error_count = cugraph::test::count_invalid_vertices( + handle, + raft::device_span{src_out.data(), src_out.size()}, + graph_view.local_vertex_partition_view()); + ASSERT_EQ(error_count, 0) << "Source vertices out of range > 0"; + + error_count = cugraph::test::count_invalid_vertices( + handle, + raft::device_span{dst_out.data(), dst_out.size()}, + graph_view.local_vertex_partition_view()); + ASSERT_EQ(error_count, 0) << "Dest vertices out of range > 0"; + + if (negative_sampling_usecase.remove_duplicates) { + error_count = cugraph::test::count_duplicate_vertex_pairs_sorted( + handle, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}); + ASSERT_EQ(error_count, 0) << "Remove duplicates specified, found duplicate entries"; + } + + if (negative_sampling_usecase.remove_existing_edges) { + rmm::device_uvector graph_src(0, handle.get_stream()); + rmm::device_uvector graph_dst(0, handle.get_stream()); + + std::tie(graph_src, graph_dst, std::ignore, std::ignore, std::ignore) = + cugraph::decompress_to_edgelist( + handle, graph_view, std::nullopt, std::nullopt, std::nullopt, std::nullopt); + + error_count = cugraph::test::count_intersection( + handle, + raft::device_span{graph_src.data(), graph_src.size()}, + raft::device_span{graph_dst.data(), graph_dst.size()}, + std::nullopt, + std::nullopt, + std::nullopt, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}, + std::nullopt, + std::nullopt, + std::nullopt); + + ASSERT_EQ(error_count, 0) << "Remove existing edges specified, found existing edges"; + } + + if (negative_sampling_usecase.exact_number_of_samples) { + ASSERT_EQ(src_out.size(), num_samples) << "Expected exact number of samples"; + } + + // TBD: How do we determine if we have properly reflected the biases? 
+ } + } + + private: + graph_t graph_; + std::optional> edge_weights_{std::nullopt}; + std::optional> edge_mask_{std::nullopt}; + std::optional> renumber_map_labels_{std::nullopt}; +}; + +using Tests_Negative_Sampling_File_i32_i32_float = + Tests_Negative_Sampling; + +using Tests_Negative_Sampling_File_i32_i64_float = + Tests_Negative_Sampling; + +using Tests_Negative_Sampling_File_i64_i64_float = + Tests_Negative_Sampling; + +using Tests_Negative_Sampling_Rmat_i32_i32_float = + Tests_Negative_Sampling; + +using Tests_Negative_Sampling_Rmat_i32_i64_float = + Tests_Negative_Sampling; + +using Tests_Negative_Sampling_Rmat_i64_i64_float = + Tests_Negative_Sampling; + +template +void run_all_tests(CurrentTest* current_test) +{ + raft::random::RngState rng_state{0}; + + for (bool use_src_bias : {false, true}) + for (bool use_dst_bias : {false, true}) + for (bool remove_duplicates : {false, true}) + for (bool remove_existing_edges : {false, true}) + for (bool exact_number_of_samples : {false, true}) + for (bool edge_masking : {false, true}) + current_test->run_current_test(rng_state, + Negative_Sampling_Usecase{2, + use_src_bias, + use_dst_bias, + remove_duplicates, + remove_existing_edges, + exact_number_of_samples, + edge_masking}); +} + +TEST_P(Tests_Negative_Sampling_File_i32_i32_float, CheckInt32Int32Float) +{ + load_graph(override_File_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +TEST_P(Tests_Negative_Sampling_File_i32_i64_float, CheckInt32Int64Float) +{ + load_graph(override_File_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +TEST_P(Tests_Negative_Sampling_File_i64_i64_float, CheckInt64Int64Float) +{ + load_graph(override_File_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +TEST_P(Tests_Negative_Sampling_Rmat_i32_i32_float, CheckInt32Int32Float) +{ + load_graph(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +TEST_P(Tests_Negative_Sampling_Rmat_i32_i64_float, CheckInt32Int64Float) +{ + load_graph(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +TEST_P(Tests_Negative_Sampling_Rmat_i64_i64_float, CheckInt64Int64Float) +{ + load_graph(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_Negative_Sampling_File_i32_i32_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + file_large_test, + Tests_Negative_Sampling_File_i32_i32_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_Negative_Sampling_File_i32_i64_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + file_large_test, + Tests_Negative_Sampling_File_i32_i64_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_Negative_Sampling_File_i64_i64_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + file_large_test, + Tests_Negative_Sampling_File_i64_i64_float, + 
::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_Negative_Sampling_Rmat_i32_i32_float, + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_Negative_Sampling_Rmat_i32_i64_float, + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_Negative_Sampling_Rmat_i64_i64_float, + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_Negative_Sampling_Rmat_i64_i64_float, + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false, 0))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/utilities/validation_utilities.cu b/cpp/tests/utilities/validation_utilities.cu new file mode 100644 index 00000000000..3da998ad626 --- /dev/null +++ b/cpp/tests/utilities/validation_utilities.cu @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "detail/graph_partition_utils.cuh" +#include "utilities/validation_utilities.hpp" + +#include + +#include +#include +#include +#include + +namespace cugraph::test { + +template +size_t count_invalid_vertices( + raft::handle_t const& handle, + raft::device_span vertices, + cugraph::vertex_partition_view_t const& vertex_partition_view) +{ + return thrust::count_if( + handle.get_thrust_policy(), + vertices.begin(), + vertices.end(), + [vertex_partition = cugraph::vertex_partition_device_view_t{ + vertex_partition_view}] __device__(auto val) { + return !(vertex_partition.is_valid_vertex(val) && + vertex_partition.in_local_vertex_partition_range_nocheck(val)); + }); +} + +template +size_t count_duplicate_vertex_pairs_sorted(raft::handle_t const& handle, + raft::device_span src, + raft::device_span dst) +{ + return thrust::count_if(handle.get_thrust_policy(), + thrust::make_counting_iterator(1), + thrust::make_counting_iterator(src.size()), + [src, dst] __device__(size_t index) { + return (src[index - 1] == src[index]) && (dst[index - 1] == dst[index]); + }); +} + +// FIXME: Resolve this with dataframe_buffer variations in thrust_wrappers.cu +template +void sort(raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts) +{ + thrust::sort(handle.get_thrust_policy(), + thrust::make_zip_iterator(srcs.begin(), dsts.begin()), + thrust::make_zip_iterator(srcs.end(), dsts.end())); +} + +template +size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2) +{ + // FIXME: Add support for wgts, edgeids and edge_types... + // Added to the API for future support. + + auto iter1 = thrust::make_zip_iterator(srcs1.begin(), dsts1.begin()); + auto iter2 = thrust::make_zip_iterator(srcs2.begin(), dsts2.begin()); + auto output_iter = thrust::make_discard_iterator(); + + return thrust::distance(output_iter, + thrust::set_intersection(handle.get_thrust_policy(), + iter1, + iter1 + srcs1.size(), + iter2, + iter2 + srcs2.size(), + output_iter)); +#if 0 + // OLD Approach + return thrust::count_if( + handle.get_thrust_policy(), + thrust::make_zip_iterator(src_out.begin(), dst_out.begin()), + thrust::make_zip_iterator(src_out.end(), dst_out.end()), + cuda::proclaim_return_type( + [src = raft::device_span{graph_src.data(), graph_src.size()}, + dst = raft::device_span{graph_dst.data(), + graph_dst.size()}] __device__(auto tuple) { +#if 0 + // FIXME: This fails on rocky linux CUDA 11.8, works on CUDA 12 + return thrust::binary_search(thrust::seq, + thrust::make_zip_iterator(src.begin(), dst.begin()), + thrust::make_zip_iterator(src.end(), dst.end()), + tuple) ? size_t{1} : size_t{0}; +#else + auto lb = thrust::distance( + src.begin(), + thrust::lower_bound(thrust::seq, src.begin(), src.end(), thrust::get<0>(tuple))); + auto ub = thrust::distance( + src.begin(), + thrust::upper_bound(thrust::seq, src.begin(), src.end(), thrust::get<0>(tuple))); + + if (src.data()[lb] == thrust::get<0>(tuple)) { + return thrust::binary_search( + thrust::seq, dst.begin() + lb, dst.begin() + ub, thrust::get<1>(tuple)) + ? 
size_t{1} + : size_t{0}; + } else { + return size_t{0}; + } +#endif + })); +#endif +} + +template +size_t count_edges_on_wrong_int_gpu(raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts, + raft::device_span vertex_partition_range_lasts) +{ + return thrust::count_if( + handle.get_thrust_policy(), + thrust::make_zip_iterator(srcs.begin(), dsts.begin()), + thrust::make_zip_iterator(srcs.end(), dsts.end()), + [comm_rank = handle.get_comms().get_rank(), + gpu_id_key_func = cugraph::detail::compute_gpu_id_from_int_edge_endpoints_t{ + vertex_partition_range_lasts, + handle.get_comms().get_size(), + handle.get_subcomm(cugraph::partition_manager::major_comm_name()).get_size(), + handle.get_subcomm(cugraph::partition_manager::minor_comm_name()) + .get_size()}] __device__(auto e) { + return (gpu_id_key_func(thrust::get<0>(e), thrust::get<1>(e)) != comm_rank); + }); +} + +// TODO: Split SG from MG? +template size_t count_invalid_vertices( + raft::handle_t const& handle, + raft::device_span vertices, + cugraph::vertex_partition_view_t const& vertex_partition_view); + +template size_t count_invalid_vertices( + raft::handle_t const& handle, + raft::device_span vertices, + cugraph::vertex_partition_view_t const& vertex_partition_view); + +template size_t count_duplicate_vertex_pairs_sorted(raft::handle_t const& handle, + raft::device_span src, + raft::device_span dst); + +template size_t count_duplicate_vertex_pairs_sorted(raft::handle_t const& handle, + raft::device_span src, + raft::device_span dst); + +template void sort(raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts); +template void sort(raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts); + +template size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + 
raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template size_t count_edges_on_wrong_int_gpu( + raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts, + raft::device_span vertex_partition_range_lasts); + +template size_t count_edges_on_wrong_int_gpu( + raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts, + raft::device_span vertex_partition_range_lasts); + +} // namespace cugraph::test diff --git a/cpp/tests/utilities/validation_utilities.hpp b/cpp/tests/utilities/validation_utilities.hpp new file mode 100644 index 00000000000..b94ceaf68be --- /dev/null +++ b/cpp/tests/utilities/validation_utilities.hpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace cugraph::test { +template +size_t count_invalid_vertices( + raft::handle_t const& handle, + raft::device_span vertices, + cugraph::vertex_partition_view_t const& vertex_partition); + +template +size_t count_duplicate_vertex_pairs_sorted(raft::handle_t const& handle, + raft::device_span src, + raft::device_span dst); + +template +void sort(raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts); + +template +size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template +size_t count_edges_on_wrong_int_gpu(raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts, + raft::device_span vertex_partition_range_lasts); + +} // namespace cugraph::test diff --git a/dependencies.yaml b/dependencies.yaml index 0d2d8c5325d..1fb47d9e368 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -20,6 +20,7 @@ files: - depends_on_dask_cudf - depends_on_pylibraft - depends_on_raft_dask + # Deprecate pylibcugraphops - depends_on_pylibcugraphops - depends_on_pylibwholegraph - depends_on_cupy @@ -44,6 +45,7 @@ files: - cuda_version - docs - py_version + # Deprecate pylibcugraphops - depends_on_pylibcugraphops test_cpp: output: none @@ -135,6 +137,7 @@ files: extras: table: project includes: + - cuda_wheels - depends_on_rmm - depends_on_pylibraft py_test_pylibcugraph: @@ -188,6 +191,7 @@ files: table: project includes: - python_run_cugraph_dgl + # Deprecate pylibcugraphops - depends_on_pylibcugraphops py_test_cugraph_dgl: output: pyproject @@ -214,6 +218,7 @@ files: table: project includes: - python_run_cugraph_pyg + # Deprecate pylibcugraphops - depends_on_pylibcugraphops py_test_cugraph_pyg: output: pyproject @@ -239,6 +244,7 @@ files: extras: table: 
    includes:
+      # Deprecate pylibcugraphops
       - depends_on_pylibcugraphops
   py_test_cugraph_equivariant:
     output: pyproject
@@ -298,6 +304,7 @@ files:
     conda_dir: python/cugraph-dgl/conda
     includes:
       - checks
+      # Deprecate pylibcugraphops
       - depends_on_pylibcugraphops
       - cugraph_dgl_dev
       - test_python_common
@@ -308,6 +315,7 @@ files:
     conda_dir: python/cugraph-pyg/conda
     includes:
       - checks
+      # Deprecate pylibcugraphops
       - depends_on_pylibcugraphops
       - cugraph_pyg_dev
       - test_python_common
@@ -376,6 +384,36 @@ dependencies:
            packages:
              - cudatoolkit
              - cuda-nvtx
+  cuda_wheels:
+    specific:
+      - output_types: pyproject
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              use_cuda_wheels: "true"
+            packages:
+              - nvidia-cublas-cu12
+              - nvidia-curand-cu12
+              - nvidia-cusolver-cu12
+              - nvidia-cusparse-cu12
+          # CUDA 11 does not provide wheels, so use the system libraries instead
+          - matrix:
+              cuda: "11.*"
+              use_cuda_wheels: "true"
+            packages:
+          # if use_cuda_wheels=false is provided, do not add dependencies on any CUDA wheels
+          # (e.g. for DLFW and pip devcontainers)
+          - matrix:
+              use_cuda_wheels: "false"
+            packages:
+          # if no matching matrix selectors passed, list the unsuffixed packages
+          # (just as a source of documentation, as this populates pyproject.toml in source control)
+          - matrix:
+            packages:
+              - nvidia-cublas
+              - nvidia-curand
+              - nvidia-cusolver
+              - nvidia-cusparse
   common_build:
     common:
       - output_types: [conda, pyproject]
@@ -389,6 +427,7 @@ dependencies:
          - c-compiler
          - cxx-compiler
          - libcudf==24.10.*,>=0.0.0a0
+          # Deprecate libcugraphops
          - libcugraphops==24.10.*,>=0.0.0a0
          - libraft-headers==24.10.*,>=0.0.0a0
          - libraft==24.10.*,>=0.0.0a0
@@ -875,6 +914,7 @@ dependencies:
          - pylibcugraph-cu11==24.10.*,>=0.0.0a0
      - {matrix: null, packages: [*pylibcugraph_unsuffixed]}
 
+  # Deprecate pylibcugraphops
   depends_on_pylibcugraphops:
     common:
       - output_types: conda
diff --git a/python/cugraph/CMakeLists.txt b/python/cugraph/CMakeLists.txt
index d96d28b6b59..ca38b5551c9 100644
--- a/python/cugraph/CMakeLists.txt
+++ b/python/cugraph/CMakeLists.txt
@@ -33,6 +33,7 @@ option(FIND_CUGRAPH_CPP "Search for existing CUGRAPH C++ installations before de
        OFF
 )
 option(USE_CUGRAPH_OPS "Enable all functions that call cugraph-ops" ON)
+option(USE_CUDA_MATH_WHEELS "Use the CUDA math wheels instead of the system libraries" OFF)
 
 if(NOT USE_CUGRAPH_OPS)
   message(STATUS "Disabling libcugraph functions that reference cugraph-ops")
@@ -49,6 +50,8 @@ endif()
 include(rapids-cython-core)
 
 if(NOT cugraph_FOUND)
+  find_package(CUDAToolkit REQUIRED)
+
   set(BUILD_TESTS OFF)
   set(BUILD_CUGRAPH_MG_TESTS OFF)
   set(BUILD_CUGRAPH_OPS_CPP_TESTS OFF)
@@ -60,8 +63,26 @@ if(NOT cugraph_FOUND)
   set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON)
   set(ALLOW_CLONE_CUGRAPH_OPS ON)
 
+  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0)
+    set(CUDA_STATIC_MATH_LIBRARIES OFF)
+  elseif(USE_CUDA_MATH_WHEELS)
+    message(FATAL_ERROR "Cannot use CUDA math wheels with CUDA < 12.0")
+  endif()
+
   add_subdirectory(../../cpp cugraph-cpp EXCLUDE_FROM_ALL)
 
+  if(NOT CUDA_STATIC_MATH_LIBRARIES AND USE_CUDA_MATH_WHEELS)
+    set(rpaths
+      "$ORIGIN/../nvidia/cublas/lib"
+      "$ORIGIN/../nvidia/cufft/lib"
+      "$ORIGIN/../nvidia/curand/lib"
+      "$ORIGIN/../nvidia/cusolver/lib"
+      "$ORIGIN/../nvidia/cusparse/lib"
+      "$ORIGIN/../nvidia/nvjitlink/lib"
+    )
+    set_property(TARGET cugraph PROPERTY INSTALL_RPATH ${rpaths} APPEND)
+  endif()
+
   set(cython_lib_dir cugraph)
   install(TARGETS cugraph DESTINATION ${cython_lib_dir})
 endif()
diff --git a/python/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/CMakeLists.txt
index d30b03cb68d..045628e9c0d 100644
--- a/python/pylibcugraph/CMakeLists.txt
+++ b/python/pylibcugraph/CMakeLists.txt
@@ -33,6 +33,7 @@ option(FIND_CUGRAPH_CPP "Search for existing CUGRAPH C++ installations before de
        OFF
 )
 option(USE_CUGRAPH_OPS "Enable all functions that call cugraph-ops" ON)
+option(USE_CUDA_MATH_WHEELS "Use the CUDA math wheels instead of the system libraries" OFF)
 
 if(NOT USE_CUGRAPH_OPS)
   message(STATUS "Disabling libcugraph functions that reference cugraph-ops")
@@ -49,6 +50,8 @@ endif()
 include(rapids-cython-core)
 
 if (NOT cugraph_FOUND)
+  find_package(CUDAToolkit REQUIRED)
+
   set(BUILD_TESTS OFF)
   set(BUILD_CUGRAPH_MG_TESTS OFF)
   set(BUILD_CUGRAPH_OPS_CPP_TESTS OFF)
@@ -60,8 +63,25 @@ if (NOT cugraph_FOUND)
   set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON)
   set(ALLOW_CLONE_CUGRAPH_OPS ON)
 
+  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0)
+    set(CUDA_STATIC_MATH_LIBRARIES OFF)
+  elseif(USE_CUDA_MATH_WHEELS)
+    message(FATAL_ERROR "Cannot use CUDA math wheels with CUDA < 12.0")
+  endif()
+
   add_subdirectory(../../cpp cugraph-cpp EXCLUDE_FROM_ALL)
 
+  if(NOT CUDA_STATIC_MATH_LIBRARIES AND USE_CUDA_MATH_WHEELS)
+    set(rpaths
+      "$ORIGIN/../nvidia/cublas/lib"
+      "$ORIGIN/../nvidia/curand/lib"
+      "$ORIGIN/../nvidia/cusolver/lib"
+      "$ORIGIN/../nvidia/cusparse/lib"
+      "$ORIGIN/../nvidia/nvjitlink/lib"
+    )
+    set_property(TARGET cugraph PROPERTY INSTALL_RPATH ${rpaths} APPEND)
+  endif()
+
   set(cython_lib_dir pylibcugraph)
   install(TARGETS cugraph DESTINATION ${cython_lib_dir})
   install(TARGETS cugraph_c DESTINATION ${cython_lib_dir})
diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/coo.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/coo.pxd
new file mode 100644
index 00000000000..e466e6ee5a0
--- /dev/null
+++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/coo.pxd
@@ -0,0 +1,71 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+)
+
+cdef extern from "cugraph_c/coo.h":
+    ctypedef struct cugraph_coo_t:
+        pass
+
+    ctypedef struct cugraph_coo_list_t:
+        pass
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_coo_get_sources(
+            cugraph_coo_t* coo
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_coo_get_destinations(
+            cugraph_coo_t* coo
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_coo_get_edge_weights(
+            cugraph_coo_t* coo
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_coo_get_edge_id(
+            cugraph_coo_t* coo
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_coo_get_edge_type(
+            cugraph_coo_t* coo
+        )
+
+    cdef size_t \
+        cugraph_coo_list_size(
+            const cugraph_coo_list_t* coo_list
+        )
+
+    cdef cugraph_coo_t* \
+        cugraph_coo_list_element(
+            cugraph_coo_list_t* coo_list,
+            size_t index)
+
+    cdef void \
+        cugraph_coo_free(
+            cugraph_coo_t* coo
+        )
+
+    cdef void \
+        cugraph_coo_list_free(
+            cugraph_coo_list_t* coo_list
+        )
diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_generators.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_generators.pxd
index f6d62377443..cda47e55f77 100644
--- a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_generators.pxd
+++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_generators.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -31,62 +31,16 @@ from pylibcugraph._cugraph_c.random cimport (
     cugraph_rng_state_t,
 )
 
+from pylibcugraph._cugraph_c.coo cimport (
+    cugraph_coo_t,
+    cugraph_coo_list_t,
+)
+
 cdef extern from "cugraph_c/graph_generators.h":
     ctypedef enum cugraph_generator_distribution_t:
         POWER_LAW
         UNIFORM
 
-    ctypedef struct cugraph_coo_t:
-        pass
-
-    ctypedef struct cugraph_coo_list_t:
-        pass
-
-    cdef cugraph_type_erased_device_array_view_t* \
-        cugraph_coo_get_sources(
-            cugraph_coo_t* coo
-        )
-
-    cdef cugraph_type_erased_device_array_view_t* \
-        cugraph_coo_get_destinations(
-            cugraph_coo_t* coo
-        )
-
-    cdef cugraph_type_erased_device_array_view_t* \
-        cugraph_coo_get_edge_weights(
-            cugraph_coo_t* coo
-        )
-
-    cdef cugraph_type_erased_device_array_view_t* \
-        cugraph_coo_get_edge_id(
-            cugraph_coo_t* coo
-        )
-
-    cdef cugraph_type_erased_device_array_view_t* \
-        cugraph_coo_get_edge_type(
-            cugraph_coo_t* coo
-        )
-
-    cdef size_t \
-        cugraph_coo_list_size(
-            const cugraph_coo_list_t* coo_list
-        )
-
-    cdef cugraph_coo_t* \
-        cugraph_coo_list_element(
-            cugraph_coo_list_t* coo_list,
-            size_t index)
-
-    cdef void \
-        cugraph_coo_free(
-            cugraph_coo_t* coo
-        )
-
-    cdef void \
-        cugraph_coo_list_free(
-            cugraph_coo_list_t* coo_list
-        )
-
     cdef cugraph_error_code_t \
         cugraph_generate_rmat_edgelist(
             const cugraph_resource_handle_t* handle,
diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd
index 0f852d9cecd..4a707db03c5 100644
--- a/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd
+++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd
@@ -39,7 +39,9 @@ from pylibcugraph._cugraph_c.random cimport (
 from pylibcugraph._cugraph_c.array cimport (
     cugraph_type_erased_device_array_t,
 )
-
+from pylibcugraph._cugraph_c.coo cimport (
+    cugraph_coo_t,
+)
 from pylibcugraph._cugraph_c.properties cimport (
     cugraph_edge_property_view_t,
 )
@@ -103,3 +105,21 @@ cdef extern from "cugraph_c/sampling_algorithms.h":
         cugraph_type_erased_device_array_t** vertices,
         cugraph_error_t** error
     )
+
+    # negative sampling
+    cdef cugraph_error_code_t \
+        cugraph_negative_sampling(
+            const cugraph_resource_handle_t* handle,
+            cugraph_rng_state_t* rng_state,
+            cugraph_graph_t* graph,
+            size_t num_samples,
+            const cugraph_type_erased_device_array_view_t* vertices,
+            const cugraph_type_erased_device_array_view_t* src_bias,
+            const cugraph_type_erased_device_array_view_t* dst_bias,
+            bool_t remove_duplicates,
+            bool_t remove_false_negatives,
+            bool_t exact_number_of_samples,
+            bool_t do_expensive_check,
+            cugraph_coo_t **result,
+            cugraph_error_t **error
+        )
diff --git a/python/pylibcugraph/pylibcugraph/generate_rmat_edgelist.pyx b/python/pylibcugraph/pylibcugraph/generate_rmat_edgelist.pyx
index f38ad21d3b0..4ea96920e61 100644
--- a/python/pylibcugraph/pylibcugraph/generate_rmat_edgelist.pyx
+++ b/python/pylibcugraph/pylibcugraph/generate_rmat_edgelist.pyx
@@ -26,11 +26,7 @@ from pylibcugraph._cugraph_c.error cimport (
 from pylibcugraph._cugraph_c.array cimport (
     cugraph_type_erased_device_array_view_t,
 )
-from pylibcugraph._cugraph_c.graph_generators cimport (
-    cugraph_generate_rmat_edgelist,
-    cugraph_generate_edge_weights,
-    cugraph_generate_edge_ids,
-    cugraph_generate_edge_types,
+from pylibcugraph._cugraph_c.coo cimport (
     cugraph_coo_t,
     cugraph_coo_get_sources,
     cugraph_coo_get_destinations,
@@ -39,6 +35,12 @@ from pylibcugraph._cugraph_c.graph_generators cimport (
     cugraph_coo_get_edge_type,
     cugraph_coo_free,
 )
+from pylibcugraph._cugraph_c.graph_generators cimport (
+    cugraph_generate_rmat_edgelist,
+    cugraph_generate_edge_weights,
+    cugraph_generate_edge_ids,
+    cugraph_generate_edge_types,
+)
 from pylibcugraph.resource_handle cimport (
     ResourceHandle,
 )
diff --git a/python/pylibcugraph/pylibcugraph/generate_rmat_edgelists.pyx b/python/pylibcugraph/pylibcugraph/generate_rmat_edgelists.pyx
index 32af0c13fc0..7de48708f80 100644
--- a/python/pylibcugraph/pylibcugraph/generate_rmat_edgelists.pyx
+++ b/python/pylibcugraph/pylibcugraph/generate_rmat_edgelists.pyx
@@ -26,14 +26,9 @@ from pylibcugraph._cugraph_c.error cimport (
 from pylibcugraph._cugraph_c.array cimport (
     cugraph_type_erased_device_array_view_t,
 )
-from pylibcugraph._cugraph_c.graph_generators cimport (
-    cugraph_generate_rmat_edgelists,
-    cugraph_generate_edge_weights,
-    cugraph_generate_edge_ids,
-    cugraph_generate_edge_types,
+from pylibcugraph._cugraph_c.coo cimport (
     cugraph_coo_t,
     cugraph_coo_list_t,
-    cugraph_generator_distribution_t,
     cugraph_coo_get_sources,
     cugraph_coo_get_destinations,
     cugraph_coo_get_edge_weights,
@@ -44,6 +39,13 @@ from pylibcugraph._cugraph_c.graph_generators cimport (
     cugraph_coo_free,
     cugraph_coo_list_free,
 )
+from pylibcugraph._cugraph_c.graph_generators cimport (
+    cugraph_generate_rmat_edgelists,
+    cugraph_generate_edge_weights,
+    cugraph_generate_edge_ids,
+    cugraph_generate_edge_types,
+    cugraph_generator_distribution_t,
+)
 from pylibcugraph.resource_handle cimport (
     ResourceHandle,
 )
diff --git a/python/pylibcugraph/pyproject.toml b/python/pylibcugraph/pyproject.toml
index 73699b6e445..727d05e8e52 100644
--- a/python/pylibcugraph/pyproject.toml
+++ b/python/pylibcugraph/pyproject.toml
@@ -23,6 +23,10 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
+    "nvidia-cublas",
+    "nvidia-curand",
+    "nvidia-cusolver",
+    "nvidia-cusparse",
     "pylibraft==24.10.*,>=0.0.0a0",
     "rmm==24.10.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -73,4 +77,4 @@ requires = [
     "pylibraft==24.10.*,>=0.0.0a0",
     "rmm==24.10.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-matrix-entry = "cuda_suffixed=true"
+matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
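
Reviewer note: as a quick orientation, the sketch below shows roughly how the new cugraph_negative_sampling entry point declared in sampling_algorithms.pxd above could be driven from C. It is illustrative only and is not part of this change: the handle/RNG/graph setup is omitted, passing NULL for the optional vertices/src_bias/dst_bias views is an assumption about the API, and the bool_t TRUE/FALSE values and CUGRAPH_SUCCESS check are taken from the existing cugraph C API conventions rather than from this diff.

/* Minimal sketch, assuming handle, rng_state, and graph were created elsewhere. */
#include <cugraph_c/sampling_algorithms.h>
#include <cugraph_c/coo.h>
#include <stdio.h>

int run_negative_sampling(const cugraph_resource_handle_t* handle,
                          cugraph_rng_state_t* rng_state,
                          cugraph_graph_t* graph,
                          size_t num_samples)
{
  cugraph_coo_t* result = NULL;
  cugraph_error_t* error = NULL;

  /* NULL vertices/src_bias/dst_bias => sample over the whole graph
     (assumption based on the view pointers in the declared signature). */
  cugraph_error_code_t status =
    cugraph_negative_sampling(handle,
                              rng_state,
                              graph,
                              num_samples,
                              NULL,  /* vertices */
                              NULL,  /* src_bias */
                              NULL,  /* dst_bias */
                              TRUE,  /* remove_duplicates */
                              TRUE,  /* remove_false_negatives */
                              FALSE, /* exact_number_of_samples */
                              FALSE, /* do_expensive_check */
                              &result,
                              &error);
  if (status != CUGRAPH_SUCCESS) {
    fprintf(stderr, "negative sampling failed\n");
    return 1;
  }

  /* The sampled edge list comes back as a COO; these getters are the ones
     declared in coo.pxd / cugraph_c/coo.h. */
  cugraph_type_erased_device_array_view_t* srcs = cugraph_coo_get_sources(result);
  cugraph_type_erased_device_array_view_t* dsts = cugraph_coo_get_destinations(result);
  (void)srcs;
  (void)dsts;

  cugraph_coo_free(result);
  return 0;
}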