Skip to content

Commit

Permalink
add e2e test
Browse files Browse the repository at this point in the history
  • Loading branch information
jalencato committed Dec 11, 2023
1 parent 3de2100 commit 98038fd
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 4 deletions.
6 changes: 3 additions & 3 deletions python/graphstorm/gconstruct/construct_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,7 @@ def process_edge_data(process_confs, node_id_map, arr_merger,

return (edges, edge_data, label_stats)

def verify_confs(confs, args=None):
def verify_confs(confs, rev_edges):
""" Verify the configuration of the input data.
"""
if "version" not in confs:
Expand All @@ -592,7 +592,7 @@ def verify_confs(confs, args=None):
ntypes = {conf['node_type'] for conf in confs["nodes"]}
etypes = [conf['relation'] for conf in confs["edges"]]
# Adjust input to DGL requirement if it is a homogeneous graph
if len(ntypes) == 1 and len(etypes) == 1 and not args.add_reverse_edges:
if len(ntypes) == 1 and len(etypes) == 1 and not rev_edges:
assert etypes[0][0] in ntypes, \
f"source node type {etypes[0][0]} does not exist. Please check your input data."
assert etypes[0][2] in ntypes, \
Expand Down Expand Up @@ -678,7 +678,7 @@ def process_graph(args):
if args.num_processes_for_nodes is not None else args.num_processes
num_processes_for_edges = args.num_processes_for_edges \
if args.num_processes_for_edges is not None else args.num_processes
verify_confs(process_confs, args)
verify_confs(process_confs, args.add_reverse_edges)
output_format = args.output_format
for out_format in output_format:
assert out_format in ["DGL", "DistDGL"], \
Expand Down
44 changes: 44 additions & 0 deletions tests/end2end-tests/data_gen/movielens_homo.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"version": "gconstruct-v0.1",
"nodes": [
{
"node_id_col": "id",
"node_type": "movie",
"format": {"name": "parquet"},
"files": "/data/ml-100k/movie.parquet",
"features": [
{
"feature_col": "title",
"transform": {
"name": "bert_hf",
"bert_model": "bert-base-uncased",
"max_seq_length": 16
}
}
],
"labels": [
{
"label_col": "label",
"task_type": "classification",
"split_pct": [0.8, 0.1, 0.1]
}
]
}
],
"edges": [
{
"source_id_col": "src_id",
"dest_id_col": "dst_id",
"relation": ["movie", "rating", "movie"],
"format": {"name": "parquet"},
"files": "/data/ml-100k/edges_homo.parquet",
"labels": [
{
"label_col": "rate",
"task_type": "classification",
"split_pct": [0.1, 0.1, 0.1]
}
]
}
]
}
5 changes: 5 additions & 0 deletions tests/end2end-tests/data_gen/process_movielens.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ def write_data_parquet(data, data_file):
edge_data = {'src_id': edges[0], 'dst_id': edges[1], 'rate': edges[2]}
write_data_parquet(edge_data, '/data/ml-100k/edges.parquet')

# generate data for homogeneous optimization test
edges = pandas.read_csv('/data/ml-100k/u.data', delimiter='\t', header=None)
edge_data = {'src_id': edges[1], 'dst_id': edges[1], 'rate': edges[2]}
write_data_parquet(edge_data, '/data/ml-100k/edges_homo.parquet')

# generate synthetic user data with label
user_labels = np.random.randint(11, size=feat.shape[0])
user_data = {'id': user['id'].values, 'feat': feat, 'occupation': user['occupation'], 'label': user_labels}
Expand Down
10 changes: 9 additions & 1 deletion tests/end2end-tests/data_process/movielens_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@ service ssh restart
GS_HOME=$(pwd)
NUM_TRAINERS=4
export PYTHONPATH=$GS_HOME/python/
cd $GS_HOME/training_scripts/gsgnn_np
echo "127.0.0.1" > ip_list.txt
cd $GS_HOME/training_scripts/gsgnn_ep

echo "127.0.0.1" > ip_list.txt

error_and_exit () {
Expand All @@ -27,6 +28,13 @@ python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2

error_and_exit $?

echo "********* Test Homogeneous Graph Optimization ********"
python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens_homo.json --num-processes 1 --output-dir /tmp/movielen_100k_train_val_1p_4t_homo --graph-name movie-lens-100k
error_and_exit $?

python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homo/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --target-ntype _N
error_and_exit $?

echo "********* Test the DistDGL graph format with BERT embeddings ********"
python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens.json --num-processes 1 --output-dir /tmp/movielens_bert_emb --graph-name ml --add-reverse-edges

Expand Down

0 comments on commit 98038fd

Please sign in to comment.