fix issue with num nodes and edges
alexbarghi-nv committed Sep 27, 2023
1 parent b369e97 commit 13be49c
Showing 4 changed files with 72 additions and 58 deletions.
4 changes: 1 addition & 3 deletions python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py
@@ -880,9 +880,7 @@ def _get_renumbered_edge_groups_from_sample(
# Step 1: get the index of the new id
ix_c = torch.searchsorted(
torch.as_tensor(id_map.index.values, device="cuda"),
torch.as_tensor(
sampling_results.minors.values, device="cuda"
),
torch.as_tensor(sampling_results.minors.values, device="cuda"),
)
# Step 2: Go from id indices to actual ids
col_dict[t_pyg_type] = torch.as_tensor(id_map.values, device="cuda")[
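The two-step lookup above is the heart of the renumbering: searchsorted finds where each sampled id sits in the sorted index, and a gather then maps those positions to the new ids. A minimal standalone sketch with toy values (the real id_map is a cuDF-backed mapping, assumed here as two tensors):

import torch

id_index = torch.tensor([10, 20, 30, 40])       # id_map.index.values, sorted
id_values = torch.tensor([100, 101, 102, 103])  # id_map.values
minors = torch.tensor([30, 10, 40])             # sampled destination vertices

# Step 1: get the index of each new id within the sorted index
ix_c = torch.searchsorted(id_index, minors)     # tensor([2, 0, 3])
# Step 2: go from id indices to actual ids
renumbered = id_values[ix_c]                    # tensor([102, 100, 103])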
72 changes: 45 additions & 27 deletions python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py
@@ -60,7 +60,7 @@ def __init__(
# Sampler args
num_neighbors: Union[List[int], Dict[Tuple[str, str, str], List[int]]] = None,
replace: bool = True,
compression: str = 'COO',
compression: str = "COO",
# Other kwargs for the BulkSampler
**kwargs,
):
@@ -179,8 +179,8 @@ def __init__(
renumber=renumber,
use_legacy_names=False,
deduplicate_sources=True,
prior_sources_behavior='exclude',
include_hop_column = (compression == 'COO'),
prior_sources_behavior="exclude",
include_hop_column=(compression == "COO"),
**kwargs,
)
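A hedged note on the include_hop_column flag set above: COO output needs an explicit hop id on every edge row, while CSR/CSC output encodes hop boundaries in label_hop_offsets, so the column would be redundant there. The rule in miniature (example value assumed):

compression = "CSR"                        # or "COO"
include_hop_column = compression == "COO"  # False here: offsets carry the hops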

@@ -253,7 +253,7 @@ def __next__(self):
)

raw_sample_data = cudf.read_parquet(parquet_path)

if "map" in raw_sample_data.columns:
if "renumber_map_offsets" not in raw_sample_data.columns:
num_batches = end_inclusive - self.__start_inclusive + 1
@@ -268,21 +268,29 @@ def __next__(self):
self.__renumber_map_offsets = map[0 : num_batches + 1] - map[0]
self.__renumber_map = map[num_batches + 1 :]
else:
self.__renumber_map = raw_sample_data['map']
self.__renumber_map_offsets = raw_sample_data['renumber_map_offsets']
raw_sample_data.drop(columns=['map', 'renumber_map_offsets'], inplace=True)

self.__renumber_map = raw_sample_data["map"]
self.__renumber_map_offsets = raw_sample_data[
"renumber_map_offsets"
]
raw_sample_data.drop(
columns=["map", "renumber_map_offsets"], inplace=True
)

self.__renumber_map.dropna(inplace=True)
self.__renumber_map = torch.as_tensor(self.__renumber_map, device='cuda')
self.__renumber_map = torch.as_tensor(
self.__renumber_map, device="cuda"
)

self.__renumber_map_offsets.dropna(inplace=True)
self.__renumber_map_offsets = torch.as_tensor(self.__renumber_map_offsets, device='cuda')
self.__renumber_map_offsets = torch.as_tensor(
self.__renumber_map_offsets, device="cuda"
)

else:
self.__renumber_map = None

self.__data = raw_sample_data
self.__coo = ('majors' in self.__data.columns)
self.__coo = "majors" in self.__data.columns
if self.__coo:
self.__data.dropna(inplace=True)
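A worked toy example (values assumed) of the legacy map decoding above, where the first num_batches + 1 entries of the column are offsets into the remainder:

import torch

num_batches = 2
map_col = torch.tensor([3, 5, 8, 100, 101, 102, 103, 104])

offsets = map_col[0 : num_batches + 1] - map_col[0]  # tensor([0, 2, 5])
renumber_map = map_col[num_batches + 1 :]            # the ids themselves
# batch 0 renumbers with renumber_map[0:2]; batch 1 with renumber_map[2:5]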

@@ -292,33 +300,37 @@
):
if self.__coo:
group_cols = ["batch_id", "hop_id"]
self.__data_index = self.__data.groupby(group_cols, as_index=True).agg(
{"majors": "max", "minors": "max"}
)
self.__data_index = self.__data.groupby(
group_cols, as_index=True
).agg({"majors": "max", "minors": "max"})
self.__data_index.rename(
columns={"majors": "src_max", "minors": "dst_max"},
inplace=True,
)
self.__data_index = self.__data_index.to_dict(orient="index")
else:
self.__data_index = None
self.__label_hop_offsets = self.__data['label_hop_offsets']
self.__data.drop(columns=['label_hop_offsets'], inplace=True)

self.__label_hop_offsets = self.__data["label_hop_offsets"]
self.__data.drop(columns=["label_hop_offsets"], inplace=True)
self.__label_hop_offsets.dropna(inplace=True)
self.__label_hop_offsets = torch.as_tensor(self.__label_hop_offsets, device='cuda')
self.__label_hop_offsets = torch.as_tensor(
self.__label_hop_offsets, device="cuda"
)
self.__label_hop_offsets -= self.__label_hop_offsets[0].clone()

self.__major_offsets = self.__data['major_offsets']
self.__data.drop(columns='major_offsets', inplace=True)
self.__major_offsets = self.__data["major_offsets"]
self.__data.drop(columns="major_offsets", inplace=True)
self.__major_offsets.dropna(inplace=True)
self.__major_offsets = torch.as_tensor(self.__major_offsets, device='cuda')
self.__major_offsets = torch.as_tensor(
self.__major_offsets, device="cuda"
)
self.__major_offsets -= self.__major_offsets[0].clone()

self.__minors = self.__data['minors']
self.__data.drop(columns='minors', inplace=True)
self.__minors = self.__data["minors"]
self.__data.drop(columns="minors", inplace=True)
self.__minors.dropna(inplace=True)
self.__minors = torch.tensor(self.__minors, device='cuda')
self.__minors = torch.tensor(self.__minors, device="cuda")

num_batches = self.__end_exclusive - self.__start_inclusive
offsets_len = len(self.__label_hop_offsets) - 1
@@ -355,9 +367,15 @@ def __next__(self):
)
else:
i = (self.__next_batch - self.__start_inclusive) * self.__fanout_length
current_label_hop_offsets = self.__label_hop_offsets[i : i + self.__fanout_length + 1]
current_major_offsets = self.__major_offsets[current_label_hop_offsets[0] : current_label_hop_offsets[-1]+1]
current_minors = self.__minors[current_major_offsets[0] : current_major_offsets[-1] + 1]
current_label_hop_offsets = self.__label_hop_offsets[
i : i + self.__fanout_length + 1
]
current_major_offsets = self.__major_offsets[
current_label_hop_offsets[0] : current_label_hop_offsets[-1] + 1
]
current_minors = self.__minors[
current_major_offsets[0] : current_major_offsets[-1] + 1
]

sampler_output = _sampler_output_from_sampling_results_homogeneous_csr(
current_major_offsets,
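The CSR branch above slices through two levels of offsets: label_hop_offsets indexes into major_offsets, which in turn indexes into minors. A small sketch with assumed values:

import torch

label_hop_offsets = torch.tensor([0, 1, 4])    # one batch, fanout_length == 2
major_offsets = torch.tensor([0, 3, 5, 7, 8])  # CSR offsets into minors
minors = torch.arange(8)                       # 8 sampled edges in total

lo = label_hop_offsets
mo = major_offsets[lo[0] : lo[-1] + 1]  # this batch's per-major offsets
mi = minors[mo[0] : mo[-1] + 1]         # this batch's destination vertices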
17 changes: 12 additions & 5 deletions python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py
@@ -222,9 +222,11 @@ def _sampler_output_from_sampling_results_homogeneous_csr(

major_offsets = major_offsets.clone() - major_offsets[0]

num_nodes_per_hop_dict = {node_type: label_hop_offsets.diff()}
num_edges_per_hop_dict = {edge_type: major_offsets[1:]}

num_edges_per_hop_dict = {edge_type: major_offsets[label_hop_offsets].diff().cpu()}

label_hop_offsets = label_hop_offsets.cpu()
num_nodes_per_hop_dict = {node_type: torch.concat([label_hop_offsets.diff(), (renumber_map.shape[0] - label_hop_offsets[-1]).reshape((1,))])}
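This hunk is the substance of the fix named in the commit message: edges per hop now come from differencing major_offsets at the hop boundaries rather than taking major_offsets[1:] (which produced per-major offsets, not per-hop counts), and nodes per hop now include the trailing minors-only vertices that appear in the renumber map but never as majors. A worked toy example (values assumed):

import torch

label_hop_offsets = torch.tensor([0, 1, 3])  # hop boundaries among majors
major_offsets = torch.tensor([0, 2, 5, 9])   # CSR offsets into minors
renumber_map = torch.arange(7)               # 7 distinct vertices in the batch

num_edges = major_offsets[label_hop_offsets].diff()  # tensor([2, 7])
num_nodes = torch.concat([
    label_hop_offsets.diff(),                        # majors per hop: 1, 2
    (renumber_map.shape[0] - label_hop_offsets[-1]).reshape((1,)),  # plus 4
])                                                   # tensor([1, 2, 4])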

noi_index = {node_type: torch.as_tensor(renumber_map, device="cuda")}

col_dict = {
@@ -407,7 +409,12 @@ def filter_cugraph_store_csc(
for attr in graph_store.get_all_edge_attrs():
key = attr.edge_type
if key in row_dict and key in col_dict:
data.put_edge_index((row_dict[key], col_dict[key]), edge_type=key, layout='csc', is_sorted=True)
data.put_edge_index(
(row_dict[key], col_dict[key]),
edge_type=key,
layout="csc",
is_sorted=True,
)
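A hedged note on the tuple passed to put_edge_index here, based on PyG's GraphStore layout conventions (assumed, not shown in this diff): for layout="csc" the pair is (row, colptr), i.e. source indices grouped by destination plus per-destination offsets.

import torch

colptr = torch.tensor([0, 2, 3])  # offsets for two destination columns
row = torch.tensor([0, 1, 2])     # sources, sorted by destination
# destination 0 receives edges from rows 0 and 1; destination 1 from row 2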

required_attrs = []
for attr in feature_store.get_all_tensor_attrs():
@@ -420,4 +427,4 @@ def filter_cugraph_store_csc(
for i, attr in enumerate(required_attrs):
data[attr.group_name][attr.attr_name] = tensors[i]

return data
37 changes: 14 additions & 23 deletions python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py
@@ -130,12 +130,10 @@ def test_cugraph_loader_from_disk():
assert list(edge_index.shape) == [2, 8]

assert (
edge_index[0].tolist()
== bogus_samples.majors.dropna().values_host.tolist()
edge_index[0].tolist() == bogus_samples.majors.dropna().values_host.tolist()
)
assert (
edge_index[1].tolist()
== bogus_samples.minors.dropna().values_host.tolist()
edge_index[1].tolist() == bogus_samples.minors.dropna().values_host.tolist()
)

assert num_samples == 256
@@ -190,12 +188,10 @@ def test_cugraph_loader_from_disk_subset():
assert list(edge_index.shape) == [2, 8]

assert (
edge_index[0].tolist()
== bogus_samples.majors.dropna().values_host.tolist()
edge_index[0].tolist() == bogus_samples.majors.dropna().values_host.tolist()
)
assert (
edge_index[1].tolist()
== bogus_samples.minors.dropna().values_host.tolist()
edge_index[1].tolist() == bogus_samples.minors.dropna().values_host.tolist()
)

assert num_samples == 100
@@ -221,12 +217,14 @@ def test_cugraph_loader_from_disk_subset_csr():
"minors": [1, 2, 3, 0, 3, 4, 5, 1],
"edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"),
"edge_id": [5, 10, 15, 20, 25, 30, 35, 40],
"label_hop_offsets": cudf.Series([0, 1, 4, None, None, None, None, None], dtype='int32'),
'renumber_map_offsets': cudf.Series([0, 6], dtype='int32'),
"label_hop_offsets": cudf.Series(
[0, 1, 4, None, None, None, None, None], dtype="int32"
),
"renumber_map_offsets": cudf.Series([0, 6], dtype="int32"),
}
)
map = cudf.Series(m, name="map")
bogus_samples['map'] = map
bogus_samples["map"] = map
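Reading the fixture: label_hop_offsets = [0, 1, 4] marks one major for hop 0 and three for hop 1, and renumber_map_offsets = [0, 6] marks a single batch whose renumber map holds six vertices; the trailing Nones only pad the shorter columns to the frame's length. A quick check of the hop sizes (sketch):

import torch

label_hop_offsets = torch.tensor([0, 1, 4])  # after the loader drops the NaNs
majors_per_hop = label_hop_offsets.diff()    # tensor([1, 3])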

tempdir = tempfile.TemporaryDirectory()
for s in range(256):
@@ -248,22 +246,19 @@ def test_cugraph_loader_from_disk_subset_csr():
# correct vertex order is [0, 1, 2, 6, 4, 3, 5]; x = [1, 2, 3, 7, 5, 4, 6]
assert sample["t0"]["x"].tolist() == [1, 2, 3, 4, 5, 6]

print(list(sample[("t0", "knows", "t0")].keys()))
edge_index = sample[("t0", "knows", "t0")]["adj_t"]
print(edge_index)
assert edge_index.size(0) == 4
assert edge_index.size(1) == 6

colptr, row, _ = edge_index.csr()

assert (
colptr.tolist()
== bogus_samples.major_offsets.dropna().values_host.tolist()
)
assert (
row.tolist()
== bogus_samples.minors.dropna().values_host.tolist()
colptr.tolist() == bogus_samples.major_offsets.dropna().values_host.tolist()
)
assert row.tolist() == bogus_samples.minors.dropna().values_host.tolist()

#assert sample['t0']['num_sampled_nodes'].tolist() == [1, 3, 2]
assert sample['t0','knows','t0']['num_sampled_edges'].tolist() == [3, 5]
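The expected [3, 5] follows from the sampler fix above: with hop boundaries [0, 1, 4], num_sampled_edges is major_offsets differenced at those positions. A hedged check (the fixture's major_offsets column lies outside the visible hunk, so its values are assumed, constrained to start at 0 and end at the eight minors):

import torch

label_hop_offsets = torch.tensor([0, 1, 4])
major_offsets = torch.tensor([0, 3, 4, 6, 8])  # assumed fixture values

num_sampled_edges = major_offsets[label_hop_offsets].diff()  # tensor([3, 5])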

assert num_samples == 100

@@ -320,8 +315,6 @@ def test_cugraph_loader_e2e_coo():
num_sampled_nodes = hetero_data["t0"]["num_sampled_nodes"]
num_sampled_edges = hetero_data["t0", "knows", "t0"]["num_sampled_edges"]

print(num_sampled_nodes, num_sampled_edges)

for i in range(len(convs)):
x, ei, _ = trim(i, num_sampled_nodes, num_sampled_edges, x, ei, None)

@@ -330,9 +323,7 @@ def test_cugraph_loader_e2e_coo():
x = convs[i](x, ei, size=(s, s))
x = relu(x)
x = dropout(x, p=0.5)
print(x.shape)

print(x.shape)
x = x.narrow(dim=0, start=0, length=x.shape[0] - num_sampled_nodes[1])

assert list(x.shape) == [3, 1]
