wrap up sg API

rapidsai · Sep 20, 2023 · 3195298 · 3195298
1 parent 2bd93d9
commit 3195298
Show file tree

Hide file tree

Showing 4 changed files with 60 additions and 47 deletions.
diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp
@@ -28,7 +28,6 @@
 #include <cugraph/sampling_functions.hpp>
 
 #include <raft/core/handle.hpp>
-#include <iostream>
 
 namespace cugraph {
 namespace c_api {
@@ -219,19 +218,6 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct
           options_.dedupe_sources_,
           do_expensive_check_);
 
-      std::cout << "has labels? " << has_labels << std::endl;
-      std::cout << "has offsets? " << (offsets.has_value()) << std::endl;
-
-      bool print=false;
-      if (offsets->size() < 10) {
-        print=true;
-        for(size_t k = 0; k < offsets->size(); ++k) std::cout << offsets->element(k, handle_.get_stream()) << " ";
-        std::cout << std::endl;
-
-        for(size_t k = 0; k < hop->size(); ++k) std::cout << hop->element(k, handle_.get_stream()) << " ";
-        std::cout << std::endl;
-      }
-
       std::vector<vertex_t> vertex_partition_lasts = graph_view.vertex_partition_range_lasts();
 
       cugraph::unrenumber_int_vertices<vertex_t, multi_gpu>(handle_,
@@ -340,8 +326,6 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct
           CUGRAPH_FAIL("Can only use COO format if not renumbering");
         }
 
-        std::cout << "offsets? " << offsets.has_value() << std::endl;
-
         std::tie(src, dst, wgt, edge_id, edge_type, label_hop_offsets) =
           cugraph::sort_sampled_edgelist(
             handle_,
@@ -365,12 +349,6 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct
 
         hop.reset();
         offsets.reset();
-
-        if(print && label_hop_offsets) {
-          std::cout << "printing label_hop_offsets: ";
-          for(size_t k = 0; k < label_hop_offsets->size(); ++k) std::cout << label_hop_offsets->element(k, handle_.get_stream());
-          std::cout << std::endl;
-        }
       }
 
       result_ = new cugraph::c_api::cugraph_sample_result_t{

diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh
@@ -166,9 +166,12 @@ void check_input_edges(
                                               std::numeric_limits<label_index_t>::max()),
                   "Invalid input arguments: current implementation assumes that the number of "
                   "unique labels is no larger than std::numeric_limits<uint32_t>::max().");
+  /*
   CUGRAPH_EXPECTS(!edgelist_label_offsets || std::get<1>(*edgelist_label_offsets) > 0,
                   "Invlaid input arguments: there should be 1 or more labels if "
                   "edgelist_label_offsets.has_value() is true.");
+  */
+
   CUGRAPH_EXPECTS(
     !edgelist_label_offsets.has_value() ||
       (std::get<0>(*edgelist_label_offsets).size() == std::get<1>(*edgelist_label_offsets) + 1),

diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py
@@ -294,6 +294,7 @@ def uniform_neighbor_sample(
                 start_list = G.lookup_internal_vertex_id(start_list, columns)
             start_list = start_list.rename(columns={columns[0]: start_col_name})
 
+
     sampling_result = pylibcugraph_uniform_neighbor_sample(
         resource_handle=ResourceHandle(),
         input_graph=G._plc_graph,
@@ -343,21 +344,33 @@ def uniform_neighbor_sample(
             })
 
             if not return_offsets:
-                batch_ids_r = cudf.Series(batch_ids).repeat(
-                    cp.diff(sampling_result['renumber_map_offsets'][:-1])
-                )
-                batch_ids_r.reset_index(drop=True, inplace=True)
-                renumber_df["batch_id"] = batch_ids_r
+                if len(batch_ids) > 0:
+                    print(batch_ids)
+                    print(sampling_result['renumber_map_offsets'])
+                    batch_ids_r = cudf.Series(batch_ids).repeat(
+                        cp.diff(sampling_result['renumber_map_offsets'])
+                    )
+                    batch_ids_r.reset_index(drop=True, inplace=True)
+                    renumber_df["batch_id"] = batch_ids_r
+                else:
+                    renumber_df['batch_id'] = None
 
         if return_offsets:
             batches_series = cudf.Series(
                 batch_ids,
                 name="batch_id",
             )
-            offsets_df = cudf.Series(
-                label_hop_offsets,
-                name="offsets",
-            ).to_frame()
+            if include_hop_column:
+                # TODO remove this logic in release 23.12
+                offsets_df = cudf.Series(
+                    label_hop_offsets[cp.arange(len(batch_ids)+1) * len(fanout_vals)],
+                    name='offsets',
+                ).to_frame()
+            else:
+                offsets_df = cudf.Series(
+                    label_hop_offsets,
+                    name="offsets",
+                ).to_frame()
 
             if len(batches_series) > len(offsets_df):
                 # this is extremely rare so the inefficiency is ok
@@ -376,23 +389,34 @@ def uniform_neighbor_sample(
                     renumber_df = renumber_df.join(renumber_offset_series, how='outer').sort_index()
                 else:
                     renumber_df['renumber_map_offsets'] = renumber_offset_series
-
-            if include_hop_column:
-                print(batch_ids)
-                print(label_hop_offsets)
-                raise ValueError("asdf")
 
         else:
             if len(batch_ids) > 0:
-                if renumber: # FIXME change this once Seunghwa updates the sampling API
-                    batch_ids = cudf.Series(cp.repeat(batch_ids, len(fanout_vals)))
-
-                batch_ids = cudf.Series(batch_ids).repeat(cp.diff(label_hop_offsets))
-                batch_ids.reset_index(drop=True, inplace=True)
-                print('output batch ids:', batch_ids)
-
-            results_df["batch_id"] = batch_ids
+                batch_ids_r = cudf.Series(cp.repeat(batch_ids, len(fanout_vals)))
+                batch_ids_r = cudf.Series(batch_ids_r).repeat(cp.diff(label_hop_offsets))                    
+                batch_ids_r.reset_index(drop=True, inplace=True)
+
+                results_df["batch_id"] = batch_ids_r
+            else:
+                results_df['batch_id'] = None
 
+        # TODO remove this logic in release 23.12; hops will always be returned as offsets
+        if include_hop_column:
+            if len(batch_ids) > 0:
+                hop_ids_r = cudf.Series(cp.arange(len(fanout_vals)))
+                hop_ids_r = cudf.concat([hop_ids_r] * len(batch_ids),ignore_index=True)
+                print(len(hop_ids_r))
+                print(len(label_hop_offsets))
+
+                # generate the hop column
+                hop_ids_r = cudf.Series(hop_ids_r, name='hop_id').repeat(
+                    cp.diff(label_hop_offsets)
+                ).reset_index(drop=True)
+            else:
+                hop_ids_r = cudf.Series(name='hop_id', dtype='int32')
+
+            results_df = results_df.join(hop_ids_r, how='outer').sort_index()
+
         if major_col_name not in results_df:
             if use_legacy_names:
                 raise ValueError("Can't use legacy names with major offsets")

diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py
@@ -308,7 +308,8 @@ def test_uniform_neighbor_sample_unweighted(simple_unweighted_input_expected_out
 @pytest.mark.sg
 @pytest.mark.cugraph_ops
 @pytest.mark.parametrize("return_offsets", [True, False])
-def test_uniform_neighbor_sample_edge_properties(return_offsets):
+@pytest.mark.parametrize("include_hop_column", [True, False])
+def test_uniform_neighbor_sample_edge_properties(return_offsets, include_hop_column):
     edgelist_df = cudf.DataFrame(
         {
             "src": cudf.Series([0, 1, 2, 3, 4, 3, 4, 2, 0, 1, 0, 2], dtype="int32"),
@@ -342,6 +343,7 @@ def test_uniform_neighbor_sample_edge_properties(return_offsets):
         with_edge_properties=True,
         with_batch_ids=True,
         return_offsets=return_offsets,
+        include_hop_column=include_hop_column
     )
     if return_offsets:
         sampling_results, sampling_offsets = sampling_results
@@ -364,11 +366,17 @@ def test_uniform_neighbor_sample_edge_properties(return_offsets):
         == sampling_results["destinations"].values_host.tolist()
     )
 
-    assert sampling_results["hop_id"].values_host.tolist() == ([0, 0, 1, 1, 1, 1] * 2)
+    if include_hop_column:
+        assert sampling_results["hop_id"].values_host.tolist() == ([0, 0, 1, 1, 1, 1] * 2)
+    else:
+        assert 'hop_id' not in sampling_results
 
     if return_offsets:
         assert sampling_offsets["batch_id"].dropna().values_host.tolist() == [0, 1]
-        assert sampling_offsets["offsets"].dropna().values_host.tolist() == [0, 6, 12]
+        if include_hop_column:
+            assert sampling_offsets["offsets"].dropna().values_host.tolist() == [0, 6, 12]
+        else:
+            assert sampling_offsets["offsets"].dropna().values_host.tolist() == [0, 2, 6, 8, 12]
     else:
         assert sampling_results["batch_id"].values_host.tolist() == ([0] * 6 + [1] * 6)