diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py index bf3cb6bcb6..42a63a8fac 100644 --- a/python/graphstorm/gconstruct/construct_graph.py +++ b/python/graphstorm/gconstruct/construct_graph.py @@ -582,7 +582,7 @@ def process_edge_data(process_confs, node_id_map, arr_merger, return (edges, edge_data, label_stats) -def verify_confs(confs, args=None): +def verify_confs(confs, rev_edges): """ Verify the configuration of the input data. """ if "version" not in confs: @@ -592,7 +592,7 @@ def verify_confs(confs, args=None): ntypes = {conf['node_type'] for conf in confs["nodes"]} etypes = [conf['relation'] for conf in confs["edges"]] # Adjust input to DGL requirement if it is a honogeneous graph - if len(ntypes) == 1 and len(etypes) == 1 and not args.add_reverse_edges: + if len(ntypes) == 1 and len(etypes) == 1 and not rev_edges: assert etypes[0][0] in ntypes, \ f"source node type {etypes[0][0]} does not exist. Please check your input data." assert etypes[0][2] in ntypes, \ @@ -678,7 +678,7 @@ def process_graph(args): if args.num_processes_for_nodes is not None else args.num_processes num_processes_for_edges = args.num_processes_for_edges \ if args.num_processes_for_edges is not None else args.num_processes - verify_confs(process_confs, args) + verify_confs(process_confs, args.add_reverse_edges) output_format = args.output_format for out_format in output_format: assert out_format in ["DGL", "DistDGL"], \ diff --git a/tests/end2end-tests/data_gen/movielens_homo.json b/tests/end2end-tests/data_gen/movielens_homo.json new file mode 100644 index 0000000000..f64d4a0c2f --- /dev/null +++ b/tests/end2end-tests/data_gen/movielens_homo.json @@ -0,0 +1,44 @@ +{ + "version": "gconstruct-v0.1", + "nodes": [ + { + "node_id_col": "id", + "node_type": "movie", + "format": {"name": "parquet"}, + "files": "/data/ml-100k/movie.parquet", + "features": [ + { + "feature_col": "title", + "transform": { + "name": "bert_hf", + "bert_model": "bert-base-uncased", + "max_seq_length": 16 + } + } + ], + "labels": [ + { + "label_col": "label", + "task_type": "classification", + "split_pct": [0.8, 0.1, 0.1] + } + ] + } + ], + "edges": [ + { + "source_id_col": "src_id", + "dest_id_col": "dst_id", + "relation": ["movie", "rating", "movie"], + "format": {"name": "parquet"}, + "files": "/data/ml-100k/edges_homo.parquet", + "labels": [ + { + "label_col": "rate", + "task_type": "classification", + "split_pct": [0.1, 0.1, 0.1] + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/end2end-tests/data_gen/process_movielens.py b/tests/end2end-tests/data_gen/process_movielens.py index 90fdcd1702..9ecc34de35 100644 --- a/tests/end2end-tests/data_gen/process_movielens.py +++ b/tests/end2end-tests/data_gen/process_movielens.py @@ -90,6 +90,11 @@ def write_data_parquet(data, data_file): edge_data = {'src_id': edges[0], 'dst_id': edges[1], 'rate': edges[2]} write_data_parquet(edge_data, '/data/ml-100k/edges.parquet') +# generate data for homogeneous optimization test +edges = pandas.read_csv('/data/ml-100k/u.data', delimiter='\t', header=None) +edge_data = {'src_id': edges[1], 'dst_id': edges[1], 'rate': edges[2]} +write_data_parquet(edge_data, '/data/ml-100k/edges_homo.parquet') + # generate synthetic user data with label user_labels = np.random.randint(11, size=feat.shape[0]) user_data = {'id': user['id'].values, 'feat': feat, 'occupation': user['occupation'], 'label': user_labels} diff --git a/tests/end2end-tests/data_process/movielens_test.sh b/tests/end2end-tests/data_process/movielens_test.sh index 455330bc38..e47a7aaac9 100644 --- a/tests/end2end-tests/data_process/movielens_test.sh +++ b/tests/end2end-tests/data_process/movielens_test.sh @@ -5,8 +5,9 @@ service ssh restart GS_HOME=$(pwd) NUM_TRAINERS=4 export PYTHONPATH=$GS_HOME/python/ +cd $GS_HOME/training_scripts/gsgnn_np +echo "127.0.0.1" > ip_list.txt cd $GS_HOME/training_scripts/gsgnn_ep - echo "127.0.0.1" > ip_list.txt error_and_exit () { @@ -27,6 +28,13 @@ python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2 error_and_exit $? +echo "********* Test Homogeneous Graph Optimization ********" +python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens_homo.json --num-processes 1 --output-dir /tmp/movielen_100k_train_val_1p_4t_homo --graph-name movie-lens-100k +error_and_exit $? + +python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homo/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --target-ntype _N +error_and_exit $? + echo "********* Test the DistDGL graph format with BERT embeddings ********" python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens.json --num-processes 1 --output-dir /tmp/movielens_bert_emb --graph-name ml --add-reverse-edges