Commit
Add more tests
Xiang Song committed Dec 22, 2023
1 parent 09527ba commit cc203c3
Showing 3 changed files with 33 additions and 2 deletions.
6 changes: 5 additions & 1 deletion python/graphstorm/gconstruct/file_io.py
@@ -20,6 +20,7 @@
import glob
import json
import os
import logging

import pyarrow.parquet as pq
import pyarrow as pa
@@ -222,7 +223,10 @@ def read_data_parquet(data_file, data_fields=None):
d = np.stack(new_d)
except Exception: # pylint: disable=broad-exception-caught
# Keep it as an ndarray of ndarrays.
# This can happen when loading hard negatives for a hard negative transformation.
logging.warning("The %s column of parquet file %s has "
                "variable-length features; this is only supported when "
                "the transformation is a hard negative transformation.",
                key, data_file)
pass
data[key] = d
return data
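For context, a minimal standalone sketch of the fallback the warning describes (not part of this commit; the file path and column name are made up): when the rows of a parquet list column have different lengths, np.stack raises and the column is kept as an object array of per-row values.

import numpy as np
import pandas as pd

pd.DataFrame({"neg": [[0, 0, 1, 1], [2, 2], [3, 3]]}).to_parquet("/tmp/ragged.parquet")
col = pd.read_parquet("/tmp/ragged.parquet")["neg"].to_numpy()
try:
    d = np.stack(col)   # fails here: rows have lengths 4, 2, 2
except Exception:       # same broad fallback as in read_data_parquet above
    d = col             # kept as an object-dtype array of per-row values
print(d.dtype)          # object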
22 changes: 21 additions & 1 deletion tests/end2end-tests/data_process/data_gen.py
@@ -20,7 +20,7 @@
import json
import os
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
import numpy as np

from graphstorm.gconstruct.file_io import write_data_parquet, write_data_json, write_data_csv
@@ -121,6 +121,10 @@ def gen_rand_nid(max_nid, num_nodes):
edge_data3_2 = {
'data': src3 + node_id3[dst_idx],
}
# Hard negative data with two entries per edge; the first row is doubled
# to four entries so the column has variable-length rows.
edge_data3_3 = {
'data': [[nid, nid] for nid in dst3]
}
edge_data3_3['data'][0] = edge_data3_3['data'][0] + edge_data3_3['data'][0]

in_dir = '/tmp/test_data/'
out_dir = '/tmp/test_out/'
@@ -153,6 +157,10 @@ def split_data(data, num):
write_data_hdf5(edge_data1_2, os.path.join(in_dir, f'edge_data1_2.hdf5'))
for i, edge_data in enumerate(split_data(edge_data3, 10)):
write_data_parquet(edge_data, os.path.join(in_dir, f'edge_data3_{i}.parquet'))
# Write the variable-length hard negative data to parquet via pandas.
df = pd.DataFrame(edge_data3_3)
df.to_parquet(os.path.join(in_dir, 'ng_edge_data3.parquet'))

write_data_hdf5(edge_data3_2, os.path.join(in_dir, f'edge_data3_2.hdf5'))


@@ -383,6 +391,18 @@ def split_data(data, num):
}
],
},
{
"relation": ("node2", "relation3", "node3"),
"format": {"name": "parquet"},
"files": os.path.join(in_dir, "ng_edge_data3.parquet"),
"features": [
{
"feature_col": "data",
"feature_name": "hard_neg2",
"transform": {"name": "edge_dst_hard_negative"}
},
],
},
{
"relation": ("node2", "relation3", "node3"),
"format": {"name": "hdf5"},
7 changes: 7 additions & 0 deletions tests/end2end-tests/data_process/test_data.py
@@ -215,3 +215,10 @@ def read_data_parquet(data_file):
_, dst_ids = g.edges(etype=("node2", "relation3", "node3"))
ground_truth = th.cat((dst_ids.reshape(-1,1), dst_ids.reshape(-1,1)), dim=1)
assert th.sum(hard_neg-ground_truth) == 0

hard_neg = g.edges[("node2", "relation3", "node3")].data["hard_neg2"]
_, dst_ids = g.edges(etype=("node2", "relation3", "node3"))
# Expected layout: each edge keeps its two duplicated destination ids padded
# with -1 up to the longest row (4); the first edge has four real entries.
ground_truth = th.cat([dst_ids.reshape(-1,1),
                       dst_ids.reshape(-1,1),
                       th.full((dst_ids.shape[0], 2), -1, dtype=dst_ids.dtype)], dim=1)
ground_truth[0][2] = dst_ids[0]
ground_truth[0][3] = dst_ids[0]
assert th.sum(hard_neg-ground_truth) == 0
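As a self-contained illustration of the ground truth built above (destination ids made up): most rows hold two duplicated destination ids padded with -1, while the first edge holds four.

import torch as th

dst_ids = th.tensor([5, 7, 9])
expected = th.cat([dst_ids.reshape(-1, 1),
                   dst_ids.reshape(-1, 1),
                   th.full((dst_ids.shape[0], 2), -1, dtype=dst_ids.dtype)], dim=1)
expected[0][2] = dst_ids[0]
expected[0][3] = dst_ids[0]
print(expected)
# tensor([[ 5,  5,  5,  5],
#         [ 7,  7, -1, -1],
#         [ 9,  9, -1, -1]])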
