fix issue with num nodes and edges
alexbarghi-nv committed Sep 27, 2023
1 parent b369e97 commit 13be49c
Showing 4 changed files with 72 additions and 58 deletions.
4 changes: 1 addition & 3 deletions python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py
@@ -880,9 +880,7 @@ def _get_renumbered_edge_groups_from_sample(
# Step 1: get the index of the new id
ix_c = torch.searchsorted(
torch.as_tensor(id_map.index.values, device="cuda"),
torch.as_tensor(
sampling_results.minors.values, device="cuda"
),
torch.as_tensor(sampling_results.minors.values, device="cuda"),
)
# Step 2: Go from id indices to actual ids
col_dict[t_pyg_type] = torch.as_tensor(id_map.values, device="cuda")[
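The two-step lookup above is the heart of the renumbering: searchsorted finds where each sampled id sits in the sorted index, and a gather then maps those positions to the new ids. A minimal standalone sketch with toy values (the real id_map is a cuDF-backed mapping, assumed here as two tensors):

import torch

id_index = torch.tensor([10, 20, 30, 40])       # id_map.index.values, sorted
id_values = torch.tensor([100, 101, 102, 103])  # id_map.values
minors = torch.tensor([30, 10, 40])             # sampled destination vertices

# Step 1: get the index of each new id within the sorted index
ix_c = torch.searchsorted(id_index, minors)     # tensor([2, 0, 3])
# Step 2: go from id indices to actual ids
renumbered = id_values[ix_c]                    # tensor([102, 100, 103])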
72 changes: 45 additions & 27 deletions python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py
@@ -60,7 +60,7 @@ def __init__(
# Sampler args
num_neighbors: Union[List[int], Dict[Tuple[str, str, str], List[int]]] = None,
replace: bool = True,
compression: str = 'COO',
compression: str = "COO",
# Other kwargs for the BulkSampler
**kwargs,
):
@@ -179,8 +179,8 @@ def __init__(
renumber=renumber,
use_legacy_names=False,
deduplicate_sources=True,
prior_sources_behavior='exclude',
include_hop_column = (compression == 'COO'),
prior_sources_behavior="exclude",
include_hop_column=(compression == "COO"),
**kwargs,
)
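A hedged note on the include_hop_column flag set above: COO output needs an explicit hop id on every edge row, while CSR/CSC output encodes hop boundaries in label_hop_offsets, so the column would be redundant there. The rule in miniature (example value assumed):

compression = "CSR"                        # or "COO"
include_hop_column = compression == "COO"  # False here: offsets carry the hops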

@@ -253,7 +253,7 @@ def __next__(self):
)

raw_sample_data = cudf.read_parquet(parquet_path)

if "map" in raw_sample_data.columns:
if "renumber_map_offsets" not in raw_sample_data.columns:
num_batches = end_inclusive - self.__start_inclusive + 1
@@ -268,21 +268,29 @@ def __next__(self):
self.__renumber_map_offsets = map[0 : num_batches + 1] - map[0]
self.__renumber_map = map[num_batches + 1 :]
else:
self.__renumber_map = raw_sample_data['map']
self.__renumber_map_offsets = raw_sample_data['renumber_map_offsets']
raw_sample_data.drop(columns=['map', 'renumber_map_offsets'], inplace=True)

self.__renumber_map = raw_sample_data["map"]
self.__renumber_map_offsets = raw_sample_data[
"renumber_map_offsets"
]
raw_sample_data.drop(
columns=["map", "renumber_map_offsets"], inplace=True
)

self.__renumber_map.dropna(inplace=True)
self.__renumber_map = torch.as_tensor(self.__renumber_map, device='cuda')
self.__renumber_map = torch.as_tensor(
self.__renumber_map, device="cuda"
)

self.__renumber_map_offsets.dropna(inplace=True)
self.__renumber_map_offsets = torch.as_tensor(self.__renumber_map_offsets, device='cuda')
self.__renumber_map_offsets = torch.as_tensor(
self.__renumber_map_offsets, device="cuda"
)

else:
self.__renumber_map = None

self.__data = raw_sample_data
self.__coo = ('majors' in self.__data.columns)
self.__coo = "majors" in self.__data.columns
if self.__coo:
self.__data.dropna(inplace=True)
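A worked toy example (values assumed) of the legacy map decoding above, where the first num_batches + 1 entries of the column are offsets into the remainder:

import torch

num_batches = 2
map_col = torch.tensor([3, 5, 8, 100, 101, 102, 103, 104])

offsets = map_col[0 : num_batches + 1] - map_col[0]  # tensor([0, 2, 5])
renumber_map = map_col[num_batches + 1 :]            # the ids themselves
# batch 0 renumbers with renumber_map[0:2]; batch 1 with renumber_map[2:5]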

@@ -292,33 +300,37 @@
):
if self.__coo:
group_cols = ["batch_id", "hop_id"]
self.__data_index = self.__data.groupby(group_cols, as_index=True).agg(
{"majors": "max", "minors": "max"}
)
self.__data_index = self.__data.groupby(
group_cols, as_index=True
).agg({"majors": "max", "minors": "max"})
self.__data_index.rename(
columns={"majors": "src_max", "minors": "dst_max"},
inplace=True,
)
self.__data_index = self.__data_index.to_dict(orient="index")
else:
self.__data_index = None
self.__label_hop_offsets = self.__data['label_hop_offsets']
self.__data.drop(columns=['label_hop_offsets'], inplace=True)

self.__label_hop_offsets = self.__data["label_hop_offsets"]
self.__data.drop(columns=["label_hop_offsets"], inplace=True)
self.__label_hop_offsets.dropna(inplace=True)
self.__label_hop_offsets = torch.as_tensor(self.__label_hop_offsets, device='cuda')
self.__label_hop_offsets = torch.as_tensor(
self.__label_hop_offsets, device="cuda"
)
self.__label_hop_offsets -= self.__label_hop_offsets[0].clone()

self.__major_offsets = self.__data['major_offsets']
self.__data.drop(columns='major_offsets', inplace=True)
self.__major_offsets = self.__data["major_offsets"]
self.__data.drop(columns="major_offsets", inplace=True)
self.__major_offsets.dropna(inplace=True)
self.__major_offsets = torch.as_tensor(self.__major_offsets, device='cuda')
self.__major_offsets = torch.as_tensor(
self.__major_offsets, device="cuda"
)
self.__major_offsets -= self.__major_offsets[0].clone()

self.__minors = self.__data['minors']
self.__data.drop(columns='minors', inplace=True)
self.__minors = self.__data["minors"]
self.__data.drop(columns="minors", inplace=True)
self.__minors.dropna(inplace=True)
self.__minors = torch.tensor(self.__minors, device='cuda')
self.__minors = torch.tensor(self.__minors, device="cuda")

num_batches = self.__end_exclusive - self.__start_inclusive
offsets_len = len(self.__label_hop_offsets) - 1
@@ -355,9 +367,15 @@ def __next__(self):
)
else:
i = (self.__next_batch - self.__start_inclusive) * self.__fanout_length
current_label_hop_offsets = self.__label_hop_offsets[i : i + self.__fanout_length + 1]
current_major_offsets = self.__major_offsets[current_label_hop_offsets[0] : current_label_hop_offsets[-1]+1]
current_minors = self.__minors[current_major_offsets[0] : current_major_offsets[-1] + 1]
current_label_hop_offsets = self.__label_hop_offsets[
i : i + self.__fanout_length + 1
]
current_major_offsets = self.__major_offsets[
current_label_hop_offsets[0] : current_label_hop_offsets[-1] + 1
]
current_minors = self.__minors[
current_major_offsets[0] : current_major_offsets[-1] + 1
]

sampler_output = _sampler_output_from_sampling_results_homogeneous_csr(
current_major_offsets,
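The CSR branch above slices through two levels of offsets: label_hop_offsets indexes into major_offsets, which in turn indexes into minors. A small sketch with assumed values:

import torch

label_hop_offsets = torch.tensor([0, 1, 4])    # one batch, fanout_length == 2
major_offsets = torch.tensor([0, 3, 5, 7, 8])  # CSR offsets into minors
minors = torch.arange(8)                       # 8 sampled edges in total

lo = label_hop_offsets
mo = major_offsets[lo[0] : lo[-1] + 1]  # this batch's per-major offsets
mi = minors[mo[0] : mo[-1] + 1]         # this batch's destination vertices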
17 changes: 12 additions & 5 deletions python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py
@@ -222,9 +222,11 @@ def _sampler_output_from_sampling_results_homogeneous_csr(

major_offsets = major_offsets.clone() - major_offsets[0]

num_nodes_per_hop_dict = {node_type: label_hop_offsets.diff()}
num_edges_per_hop_dict = {edge_type: major_offsets[1:]}

num_edges_per_hop_dict = {edge_type: major_offsets[label_hop_offsets].diff().cpu()}

label_hop_offsets = label_hop_offsets.cpu()
num_nodes_per_hop_dict = {node_type: torch.concat([label_hop_offsets.diff(), (renumber_map.shape[0] - label_hop_offsets[-1]).reshape((1,))])}
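This hunk is the substance of the fix named in the commit message: edges per hop now come from differencing major_offsets at the hop boundaries rather than taking major_offsets[1:] (which produced per-major offsets, not per-hop counts), and nodes per hop now include the trailing minors-only vertices that appear in the renumber map but never as majors. A worked toy example (values assumed):

import torch

label_hop_offsets = torch.tensor([0, 1, 3])  # hop boundaries among majors
major_offsets = torch.tensor([0, 2, 5, 9])   # CSR offsets into minors
renumber_map = torch.arange(7)               # 7 distinct vertices in the batch

num_edges = major_offsets[label_hop_offsets].diff()  # tensor([2, 7])
num_nodes = torch.concat([
    label_hop_offsets.diff(),                        # majors per hop: 1, 2
    (renumber_map.shape[0] - label_hop_offsets[-1]).reshape((1,)),  # plus 4
])                                                   # tensor([1, 2, 4])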

noi_index = {node_type: torch.as_tensor(renumber_map, device="cuda")}

col_dict = {
@@ -407,7 +409,12 @@ def filter_cugraph_store_csc(
for attr in graph_store.get_all_edge_attrs():
key = attr.edge_type
if key in row_dict and key in col_dict:
data.put_edge_index((row_dict[key], col_dict[key]), edge_type=key, layout='csc', is_sorted=True)
data.put_edge_index(
(row_dict[key], col_dict[key]),
edge_type=key,
layout="csc",
is_sorted=True,
)
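A hedged note on the tuple passed to put_edge_index here, based on PyG's GraphStore layout conventions (assumed, not shown in this diff): for layout="csc" the pair is (row, colptr), i.e. source indices grouped by destination plus per-destination offsets.

import torch

colptr = torch.tensor([0, 2, 3])  # offsets for two destination columns
row = torch.tensor([0, 1, 2])     # sources, sorted by destination
# destination 0 receives edges from rows 0 and 1; destination 1 from row 2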

required_attrs = []
for attr in feature_store.get_all_tensor_attrs():
@@ -420,4 +427,4 @@ def filter_cugraph_store_csc(
for i, attr in enumerate(required_attrs):
data[attr.group_name][attr.attr_name] = tensors[i]

return data
37 changes: 14 additions & 23 deletions python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py
@@ -130,12 +130,10 @@ def test_cugraph_loader_from_disk():
assert list(edge_index.shape) == [2, 8]

assert (
edge_index[0].tolist()
== bogus_samples.majors.dropna().values_host.tolist()
edge_index[0].tolist() == bogus_samples.majors.dropna().values_host.tolist()
)
assert (
edge_index[1].tolist()
== bogus_samples.minors.dropna().values_host.tolist()
edge_index[1].tolist() == bogus_samples.minors.dropna().values_host.tolist()
)

assert num_samples == 256
@@ -190,12 +188,10 @@ def test_cugraph_loader_from_disk_subset():
assert list(edge_index.shape) == [2, 8]

assert (
edge_index[0].tolist()
== bogus_samples.majors.dropna().values_host.tolist()
edge_index[0].tolist() == bogus_samples.majors.dropna().values_host.tolist()
)
assert (
edge_index[1].tolist()
== bogus_samples.minors.dropna().values_host.tolist()
edge_index[1].tolist() == bogus_samples.minors.dropna().values_host.tolist()
)

assert num_samples == 100
@@ -221,12 +217,14 @@ def test_cugraph_loader_from_disk_subset_csr():
"minors": [1, 2, 3, 0, 3, 4, 5, 1],
"edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"),
"edge_id": [5, 10, 15, 20, 25, 30, 35, 40],
"label_hop_offsets": cudf.Series([0, 1, 4, None, None, None, None, None], dtype='int32'),
'renumber_map_offsets': cudf.Series([0, 6], dtype='int32'),
"label_hop_offsets": cudf.Series(
[0, 1, 4, None, None, None, None, None], dtype="int32"
),
"renumber_map_offsets": cudf.Series([0, 6], dtype="int32"),
}
)
map = cudf.Series(m, name="map")
bogus_samples['map'] = map
bogus_samples["map"] = map
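Reading the fixture: label_hop_offsets = [0, 1, 4] marks one major for hop 0 and three for hop 1, and renumber_map_offsets = [0, 6] marks a single batch whose renumber map holds six vertices; the trailing Nones only pad the shorter columns to the frame's length. A quick check of the hop sizes (sketch):

import torch

label_hop_offsets = torch.tensor([0, 1, 4])  # after the loader drops the NaNs
majors_per_hop = label_hop_offsets.diff()    # tensor([1, 3])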

tempdir = tempfile.TemporaryDirectory()
for s in range(256):
@@ -248,22 +246,19 @@ def test_cugraph_loader_from_disk_subset_csr():
# correct vertex order is [0, 1, 2, 6, 4, 3, 5]; x = [1, 2, 3, 7, 5, 4, 6]
assert sample["t0"]["x"].tolist() == [1, 2, 3, 4, 5, 6]

print(list(sample[("t0", "knows", "t0")].keys()))
edge_index = sample[("t0", "knows", "t0")]["adj_t"]
print(edge_index)
assert edge_index.size(0) == 4
assert edge_index.size(1) == 6

colptr, row, _ = edge_index.csr()

assert (
colptr.tolist()
== bogus_samples.major_offsets.dropna().values_host.tolist()
)
assert (
row.tolist()
== bogus_samples.minors.dropna().values_host.tolist()
colptr.tolist() == bogus_samples.major_offsets.dropna().values_host.tolist()
)
assert row.tolist() == bogus_samples.minors.dropna().values_host.tolist()

#assert sample['t0']['num_sampled_nodes'].tolist() == [1, 3, 2]
assert sample['t0','knows','t0']['num_sampled_edges'].tolist() == [3, 5]
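The expected [3, 5] follows from the sampler fix above: with hop boundaries [0, 1, 4], num_sampled_edges is major_offsets differenced at those positions. A hedged check (the fixture's major_offsets column lies outside the visible hunk, so its values are assumed, constrained to start at 0 and end at the eight minors):

import torch

label_hop_offsets = torch.tensor([0, 1, 4])
major_offsets = torch.tensor([0, 3, 4, 6, 8])  # assumed fixture values

num_sampled_edges = major_offsets[label_hop_offsets].diff()  # tensor([3, 5])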

assert num_samples == 100

@@ -320,8 +315,6 @@ def test_cugraph_loader_e2e_coo():
num_sampled_nodes = hetero_data["t0"]["num_sampled_nodes"]
num_sampled_edges = hetero_data["t0", "knows", "t0"]["num_sampled_edges"]

print(num_sampled_nodes, num_sampled_edges)

for i in range(len(convs)):
x, ei, _ = trim(i, num_sampled_nodes, num_sampled_edges, x, ei, None)

@@ -330,9 +323,7 @@ def test_cugraph_loader_e2e_coo():
x = convs[i](x, ei, size=(s, s))
x = relu(x)
x = dropout(x, p=0.5)
print(x.shape)

print(x.shape)
x = x.narrow(dim=0, start=0, length=x.shape[0] - num_sampled_nodes[1])

assert list(x.shape) == [3, 1]
