From ea67d08cde4060ba53eb31281a943b23029e83dd Mon Sep 17 00:00:00 2001 From: JalenCato Date: Mon, 11 Dec 2023 20:16:29 +0000 Subject: [PATCH 01/21] add optimization for gconstruct --- python/graphstorm/gconstruct/construct_graph.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py index 0065da5403..bf3cb6bcb6 100644 --- a/python/graphstorm/gconstruct/construct_graph.py +++ b/python/graphstorm/gconstruct/construct_graph.py @@ -582,7 +582,7 @@ def process_edge_data(process_confs, node_id_map, arr_merger, return (edges, edge_data, label_stats) -def verify_confs(confs): +def verify_confs(confs, args=None): """ Verify the configuration of the input data. """ if "version" not in confs: @@ -591,6 +591,16 @@ def verify_confs(confs): "The config file does not have a 'version' entry. Assuming gconstruct-v0.1") ntypes = {conf['node_type'] for conf in confs["nodes"]} etypes = [conf['relation'] for conf in confs["edges"]] + # Adjust input to DGL requirement if it is a honogeneous graph + if len(ntypes) == 1 and len(etypes) == 1 and not args.add_reverse_edges: + assert etypes[0][0] in ntypes, \ + f"source node type {etypes[0][0]} does not exist. Please check your input data." + assert etypes[0][2] in ntypes, \ + f"dest node type {etypes[0][2]} does not exist. Please check your input data." + logging.warning("Generated Graph is a homogeneous graph, so the node type will be " + "changed to _N and edge type should be changed to [_N, _E, _N]") + confs['nodes'][0]['node_type'] = "_N" + confs['edges'][0]['relation'] = ["_N", "_E", "_N"] for etype in etypes: assert len(etype) == 3, \ "The edge type must be (source node type, relation type, dest node type)." @@ -668,7 +678,7 @@ def process_graph(args): if args.num_processes_for_nodes is not None else args.num_processes num_processes_for_edges = args.num_processes_for_edges \ if args.num_processes_for_edges is not None else args.num_processes - verify_confs(process_confs) + verify_confs(process_confs, args) output_format = args.output_format for out_format in output_format: assert out_format in ["DGL", "DistDGL"], \ From 3de2100eb7ad302f8b7b83c66933ca9cbf62a2be Mon Sep 17 00:00:00 2001 From: JalenCato Date: Mon, 11 Dec 2023 20:49:24 +0000 Subject: [PATCH 02/21] update unit test --- .../gconstruct/test_construct_graph.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/tests/unit-tests/gconstruct/test_construct_graph.py b/tests/unit-tests/gconstruct/test_construct_graph.py index a03a7cbec7..22321a1b14 100644 --- a/tests/unit-tests/gconstruct/test_construct_graph.py +++ b/tests/unit-tests/gconstruct/test_construct_graph.py @@ -21,12 +21,13 @@ import pyarrow.parquet as pq import numpy as np import dgl +import argparse import torch as th from functools import partial from numpy.testing import assert_equal, assert_almost_equal -from graphstorm.gconstruct.construct_graph import parse_edge_data +from graphstorm.gconstruct.construct_graph import parse_edge_data, verify_confs from graphstorm.gconstruct.file_io import write_data_parquet, read_data_parquet from graphstorm.gconstruct.file_io import write_data_json, read_data_json from graphstorm.gconstruct.file_io import write_data_csv, read_data_csv @@ -1705,6 +1706,26 @@ def test_gc(): assert not os.path.isdir("/tmp_featurewrapper2"), \ "Directory /tmp_featurewrapper2 should not exist after gc" + +def test_homo(): + conf = {'version': 'gconstruct-v0.1', 'nodes': [{'node_id_col': 'id', 'node_type': 'movie', 'format': {'name': 'parquet'}, + 'files': '/data/ml-100k/movie.parquet', 'features': [ + {'feature_col': 'title', + 'transform': {'name': 'bert_hf', 'bert_model': 'bert-base-uncased', 'max_seq_length': 16}}], 'labels': [ + {'label_col': 'label', 'task_type': 'classification', 'split_pct': [0.8, 0.1, 0.1]}]}], 'edges': [ + {'source_id_col': 'src_id', 'dest_id_col': 'dst_id', 'relation': ['movie', 'rating', 'movie'], + 'format': {'name': 'parquet'}, 'files': '/data/ml-100k/edges.parquet', + 'labels': [{'label_col': 'rate', 'task_type': 'classification', 'split_pct': [0.1, 0.1, 0.1]}]}]} + verify_confs(conf, rev_edges=False) + assert conf['nodes'][0]["node_type"] == "_N" + assert conf['edges'][0]['relation'] == ["_N", "_E", "_N"] + + conf['nodes'][0]["node_type"] = "movie" + conf['edges'][0]['relation'] = ['movie', 'rating', 'movie'] + verify_confs(conf, rev_edges=True) + assert conf['nodes'][0]["node_type"] == "movie" + assert conf['edges'][0]['relation'] == ["movie", "rating", "movie"] + if __name__ == '__main__': test_parse_edge_data() test_multiprocessing_checks() From 98038fd53ce7ecee698ff4a406c9926a6834a245 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Mon, 11 Dec 2023 21:11:18 +0000 Subject: [PATCH 03/21] add e2e test --- .../graphstorm/gconstruct/construct_graph.py | 6 +-- .../data_gen/movielens_homo.json | 44 +++++++++++++++++++ .../data_gen/process_movielens.py | 5 +++ .../data_process/movielens_test.sh | 10 ++++- 4 files changed, 61 insertions(+), 4 deletions(-) create mode 100644 tests/end2end-tests/data_gen/movielens_homo.json diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py index bf3cb6bcb6..42a63a8fac 100644 --- a/python/graphstorm/gconstruct/construct_graph.py +++ b/python/graphstorm/gconstruct/construct_graph.py @@ -582,7 +582,7 @@ def process_edge_data(process_confs, node_id_map, arr_merger, return (edges, edge_data, label_stats) -def verify_confs(confs, args=None): +def verify_confs(confs, rev_edges): """ Verify the configuration of the input data. """ if "version" not in confs: @@ -592,7 +592,7 @@ def verify_confs(confs, args=None): ntypes = {conf['node_type'] for conf in confs["nodes"]} etypes = [conf['relation'] for conf in confs["edges"]] # Adjust input to DGL requirement if it is a honogeneous graph - if len(ntypes) == 1 and len(etypes) == 1 and not args.add_reverse_edges: + if len(ntypes) == 1 and len(etypes) == 1 and not rev_edges: assert etypes[0][0] in ntypes, \ f"source node type {etypes[0][0]} does not exist. Please check your input data." assert etypes[0][2] in ntypes, \ @@ -678,7 +678,7 @@ def process_graph(args): if args.num_processes_for_nodes is not None else args.num_processes num_processes_for_edges = args.num_processes_for_edges \ if args.num_processes_for_edges is not None else args.num_processes - verify_confs(process_confs, args) + verify_confs(process_confs, args.add_reverse_edges) output_format = args.output_format for out_format in output_format: assert out_format in ["DGL", "DistDGL"], \ diff --git a/tests/end2end-tests/data_gen/movielens_homo.json b/tests/end2end-tests/data_gen/movielens_homo.json new file mode 100644 index 0000000000..f64d4a0c2f --- /dev/null +++ b/tests/end2end-tests/data_gen/movielens_homo.json @@ -0,0 +1,44 @@ +{ + "version": "gconstruct-v0.1", + "nodes": [ + { + "node_id_col": "id", + "node_type": "movie", + "format": {"name": "parquet"}, + "files": "/data/ml-100k/movie.parquet", + "features": [ + { + "feature_col": "title", + "transform": { + "name": "bert_hf", + "bert_model": "bert-base-uncased", + "max_seq_length": 16 + } + } + ], + "labels": [ + { + "label_col": "label", + "task_type": "classification", + "split_pct": [0.8, 0.1, 0.1] + } + ] + } + ], + "edges": [ + { + "source_id_col": "src_id", + "dest_id_col": "dst_id", + "relation": ["movie", "rating", "movie"], + "format": {"name": "parquet"}, + "files": "/data/ml-100k/edges_homo.parquet", + "labels": [ + { + "label_col": "rate", + "task_type": "classification", + "split_pct": [0.1, 0.1, 0.1] + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/end2end-tests/data_gen/process_movielens.py b/tests/end2end-tests/data_gen/process_movielens.py index 90fdcd1702..9ecc34de35 100644 --- a/tests/end2end-tests/data_gen/process_movielens.py +++ b/tests/end2end-tests/data_gen/process_movielens.py @@ -90,6 +90,11 @@ def write_data_parquet(data, data_file): edge_data = {'src_id': edges[0], 'dst_id': edges[1], 'rate': edges[2]} write_data_parquet(edge_data, '/data/ml-100k/edges.parquet') +# generate data for homogeneous optimization test +edges = pandas.read_csv('/data/ml-100k/u.data', delimiter='\t', header=None) +edge_data = {'src_id': edges[1], 'dst_id': edges[1], 'rate': edges[2]} +write_data_parquet(edge_data, '/data/ml-100k/edges_homo.parquet') + # generate synthetic user data with label user_labels = np.random.randint(11, size=feat.shape[0]) user_data = {'id': user['id'].values, 'feat': feat, 'occupation': user['occupation'], 'label': user_labels} diff --git a/tests/end2end-tests/data_process/movielens_test.sh b/tests/end2end-tests/data_process/movielens_test.sh index 455330bc38..e47a7aaac9 100644 --- a/tests/end2end-tests/data_process/movielens_test.sh +++ b/tests/end2end-tests/data_process/movielens_test.sh @@ -5,8 +5,9 @@ service ssh restart GS_HOME=$(pwd) NUM_TRAINERS=4 export PYTHONPATH=$GS_HOME/python/ +cd $GS_HOME/training_scripts/gsgnn_np +echo "127.0.0.1" > ip_list.txt cd $GS_HOME/training_scripts/gsgnn_ep - echo "127.0.0.1" > ip_list.txt error_and_exit () { @@ -27,6 +28,13 @@ python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2 error_and_exit $? +echo "********* Test Homogeneous Graph Optimization ********" +python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens_homo.json --num-processes 1 --output-dir /tmp/movielen_100k_train_val_1p_4t_homo --graph-name movie-lens-100k +error_and_exit $? + +python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homo/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --target-ntype _N +error_and_exit $? + echo "********* Test the DistDGL graph format with BERT embeddings ********" python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens.json --num-processes 1 --output-dir /tmp/movielens_bert_emb --graph-name ml --add-reverse-edges From 0917eaf8d90ad8d8b660fc91759d600a053f8e61 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Mon, 11 Dec 2023 21:54:10 +0000 Subject: [PATCH 04/21] update --- tests/unit-tests/gconstruct/test_construct_graph.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unit-tests/gconstruct/test_construct_graph.py b/tests/unit-tests/gconstruct/test_construct_graph.py index 22321a1b14..c5fd13a5f0 100644 --- a/tests/unit-tests/gconstruct/test_construct_graph.py +++ b/tests/unit-tests/gconstruct/test_construct_graph.py @@ -1744,4 +1744,5 @@ def test_homo(): test_label() test_multicolumn(None) test_multicolumn("/") - test_feature_wrapper() \ No newline at end of file + test_feature_wrapper() + test_homo() \ No newline at end of file From b68d01b1a49e8ddcf1f9b8a893e18552c53db060 Mon Sep 17 00:00:00 2001 From: jalencato Date: Wed, 13 Dec 2023 10:17:12 -0800 Subject: [PATCH 05/21] Apply suggestions from code review Co-authored-by: xiang song(charlie.song) --- python/graphstorm/gconstruct/construct_graph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py index 42a63a8fac..05d388f54c 100644 --- a/python/graphstorm/gconstruct/construct_graph.py +++ b/python/graphstorm/gconstruct/construct_graph.py @@ -591,14 +591,14 @@ def verify_confs(confs, rev_edges): "The config file does not have a 'version' entry. Assuming gconstruct-v0.1") ntypes = {conf['node_type'] for conf in confs["nodes"]} etypes = [conf['relation'] for conf in confs["edges"]] - # Adjust input to DGL requirement if it is a honogeneous graph + # Adjust input to DGL homogeneous graph format if it is a homogeneous graph if len(ntypes) == 1 and len(etypes) == 1 and not rev_edges: assert etypes[0][0] in ntypes, \ f"source node type {etypes[0][0]} does not exist. Please check your input data." assert etypes[0][2] in ntypes, \ f"dest node type {etypes[0][2]} does not exist. Please check your input data." logging.warning("Generated Graph is a homogeneous graph, so the node type will be " - "changed to _N and edge type should be changed to [_N, _E, _N]") + "changed to _N and edge type will be changed to [_N, _E, _N]") confs['nodes'][0]['node_type'] = "_N" confs['edges'][0]['relation'] = ["_N", "_E", "_N"] for etype in etypes: From 5240748a87ef44a3258f1279693b88bdac275973 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Wed, 13 Dec 2023 19:16:25 +0000 Subject: [PATCH 06/21] fix bug --- .../graphstorm/gconstruct/construct_graph.py | 11 ++++++-- .../data_gen/movielens_homo.json | 15 ++++++++++ .../gconstruct/test_construct_graph.py | 28 +++++++++++++++++++ 3 files changed, 51 insertions(+), 3 deletions(-) diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py index 05d388f54c..1a47782dc6 100644 --- a/python/graphstorm/gconstruct/construct_graph.py +++ b/python/graphstorm/gconstruct/construct_graph.py @@ -584,6 +584,10 @@ def process_edge_data(process_confs, node_id_map, arr_merger, def verify_confs(confs, rev_edges): """ Verify the configuration of the input data. + Parameters + ---------- + rev_edges: bool + Whether to add reverse edges """ if "version" not in confs: # TODO: Make a requirement with v1.0 launch @@ -592,15 +596,16 @@ def verify_confs(confs, rev_edges): ntypes = {conf['node_type'] for conf in confs["nodes"]} etypes = [conf['relation'] for conf in confs["edges"]] # Adjust input to DGL homogeneous graph format if it is a homogeneous graph - if len(ntypes) == 1 and len(etypes) == 1 and not rev_edges: + etype_set = set(tuple(relation) for relation in etypes) + if len(ntypes) == 1 and len(etype_set) == 1 and not rev_edges: assert etypes[0][0] in ntypes, \ f"source node type {etypes[0][0]} does not exist. Please check your input data." assert etypes[0][2] in ntypes, \ f"dest node type {etypes[0][2]} does not exist. Please check your input data." logging.warning("Generated Graph is a homogeneous graph, so the node type will be " "changed to _N and edge type will be changed to [_N, _E, _N]") - confs['nodes'][0]['node_type'] = "_N" - confs['edges'][0]['relation'] = ["_N", "_E", "_N"] + [node.update({'node_type': "_N"}) for node in confs['nodes']] + [edge.update({'relation': ["_N", "_E", "_N"]}) for edge in confs['edges']] for etype in etypes: assert len(etype) == 3, \ "The edge type must be (source node type, relation type, dest node type)." diff --git a/tests/end2end-tests/data_gen/movielens_homo.json b/tests/end2end-tests/data_gen/movielens_homo.json index f64d4a0c2f..c0f7457859 100644 --- a/tests/end2end-tests/data_gen/movielens_homo.json +++ b/tests/end2end-tests/data_gen/movielens_homo.json @@ -23,6 +23,16 @@ "split_pct": [0.8, 0.1, 0.1] } ] + }, + { + "node_type": "movie", + "format": {"name": "parquet"}, + "files": "/data/ml-100k/movie.parquet", + "features": [ + { + "feature_col": "id" + } + ] } ], "edges": [ @@ -39,6 +49,11 @@ "split_pct": [0.1, 0.1, 0.1] } ] + }, + { + "relation": ["movie", "rating", "movie"], + "format": {"name": "parquet"}, + "files": "/data/ml-100k/edges_homo.parquet" } ] } \ No newline at end of file diff --git a/tests/unit-tests/gconstruct/test_construct_graph.py b/tests/unit-tests/gconstruct/test_construct_graph.py index c5fd13a5f0..93d8bd38c0 100644 --- a/tests/unit-tests/gconstruct/test_construct_graph.py +++ b/tests/unit-tests/gconstruct/test_construct_graph.py @@ -1708,6 +1708,7 @@ def test_gc(): def test_homo(): + # single node type and edge type input conf = {'version': 'gconstruct-v0.1', 'nodes': [{'node_id_col': 'id', 'node_type': 'movie', 'format': {'name': 'parquet'}, 'files': '/data/ml-100k/movie.parquet', 'features': [ {'feature_col': 'title', @@ -1726,6 +1727,33 @@ def test_homo(): assert conf['nodes'][0]["node_type"] == "movie" assert conf['edges'][0]['relation'] == ["movie", "rating", "movie"] + # multiple node types and edge types input + conf = { + "version": "gconstruct-v0.1", "nodes": [ + {"node_id_col": "id", "node_type": "movie", "format": {"name": "parquet"}, + "files": "/data/ml-100k/movie.parquet", "features": [ + {"feature_col": "title", "transform": { + "name": "bert_hf", "bert_model": "bert-base-uncased", "max_seq_length": 16}}], + "labels": [{"label_col": "label", "task_type": "classification", "split_pct": [0.8, 0.1, 0.1]}]}, + {"node_type": "movie", "format": {"name": "parquet"}, "files": "/data/ml-100k/movie.parquet", + "features": [{"feature_col": "id"}]}], + "edges": [ + {"source_id_col": "src_id", "dest_id_col": "dst_id", "relation": ["movie", "rating", "movie"], + "format": {"name": "parquet"}, "files": "/data/ml-100k/edges_homo.parquet", "labels": [ + {"label_col": "rate", "task_type": "classification", "split_pct": [0.1, 0.1, 0.1]}]}, + {"relation": ["movie", "rating", "movie"], "format": {"name": "parquet"}, + "files": "/data/ml-100k/edges_homo.parquet"}] + } + verify_confs(conf, rev_edges=False) + assert conf['nodes'][0]["node_type"] == "_N" + assert conf['edges'][0]['relation'] == ["_N", "_E", "_N"] + + conf['nodes'][0]["node_type"] = "movie" + conf['edges'][0]['relation'] = ['movie', 'rating', 'movie'] + verify_confs(conf, rev_edges=True) + assert conf['nodes'][0]["node_type"] == "movie" + assert conf['edges'][0]['relation'] == ["movie", "rating", "movie"] + if __name__ == '__main__': test_parse_edge_data() test_multiprocessing_checks() From 1d63921898df588e7e43b40668c57cdb8d615a2d Mon Sep 17 00:00:00 2001 From: JalenCato Date: Wed, 13 Dec 2023 19:20:25 +0000 Subject: [PATCH 07/21] reformat --- .../gconstruct/test_construct_graph.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/unit-tests/gconstruct/test_construct_graph.py b/tests/unit-tests/gconstruct/test_construct_graph.py index 93d8bd38c0..35fa176313 100644 --- a/tests/unit-tests/gconstruct/test_construct_graph.py +++ b/tests/unit-tests/gconstruct/test_construct_graph.py @@ -1709,14 +1709,18 @@ def test_gc(): def test_homo(): # single node type and edge type input - conf = {'version': 'gconstruct-v0.1', 'nodes': [{'node_id_col': 'id', 'node_type': 'movie', 'format': {'name': 'parquet'}, - 'files': '/data/ml-100k/movie.parquet', 'features': [ - {'feature_col': 'title', - 'transform': {'name': 'bert_hf', 'bert_model': 'bert-base-uncased', 'max_seq_length': 16}}], 'labels': [ - {'label_col': 'label', 'task_type': 'classification', 'split_pct': [0.8, 0.1, 0.1]}]}], 'edges': [ - {'source_id_col': 'src_id', 'dest_id_col': 'dst_id', 'relation': ['movie', 'rating', 'movie'], - 'format': {'name': 'parquet'}, 'files': '/data/ml-100k/edges.parquet', - 'labels': [{'label_col': 'rate', 'task_type': 'classification', 'split_pct': [0.1, 0.1, 0.1]}]}]} + conf = { + "version": "gconstruct-v0.1", "nodes": [ + {"node_id_col": "id", "node_type": "movie", "format": {"name": "parquet"}, + "files": "/data/ml-100k/movie.parquet", "features": [ + {"feature_col": "title", "transform": { + "name": "bert_hf", "bert_model": "bert-base-uncased", "max_seq_length": 16}}], + "labels": [{"label_col": "label", "task_type": "classification", "split_pct": [0.8, 0.1, 0.1]}]}], + "edges": [ + {"source_id_col": "src_id", "dest_id_col": "dst_id", "relation": ["movie", "rating", "movie"], + "format": {"name": "parquet"}, "files": "/data/ml-100k/edges_homo.parquet", "labels": [ + {"label_col": "rate", "task_type": "classification", "split_pct": [0.1, 0.1, 0.1]}]}] + } verify_confs(conf, rev_edges=False) assert conf['nodes'][0]["node_type"] == "_N" assert conf['edges'][0]['relation'] == ["_N", "_E", "_N"] From d856178e44a3a4668f775b3847826ab3246c6030 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Wed, 13 Dec 2023 19:56:32 +0000 Subject: [PATCH 08/21] add test --- python/graphstorm/gconstruct/construct_graph.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py index 1a47782dc6..6eb1b65678 100644 --- a/python/graphstorm/gconstruct/construct_graph.py +++ b/python/graphstorm/gconstruct/construct_graph.py @@ -604,8 +604,10 @@ def verify_confs(confs, rev_edges): f"dest node type {etypes[0][2]} does not exist. Please check your input data." logging.warning("Generated Graph is a homogeneous graph, so the node type will be " "changed to _N and edge type will be changed to [_N, _E, _N]") - [node.update({'node_type': "_N"}) for node in confs['nodes']] - [edge.update({'relation': ["_N", "_E", "_N"]}) for edge in confs['edges']] + for node in confs['nodes']: + node['node_type'] = "_N" + for edge in confs['edges']: + edge['relation'] = ["_N", "_E", "_N"] for etype in etypes: assert len(etype) == 3, \ "The edge type must be (source node type, relation type, dest node type)." From 53b8c57fde9ed2e3a999e7a3996cdabba65ffa67 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Wed, 13 Dec 2023 20:04:17 +0000 Subject: [PATCH 09/21] update --- tests/unit-tests/gconstruct/test_construct_graph.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit-tests/gconstruct/test_construct_graph.py b/tests/unit-tests/gconstruct/test_construct_graph.py index 35fa176313..3025dfc031 100644 --- a/tests/unit-tests/gconstruct/test_construct_graph.py +++ b/tests/unit-tests/gconstruct/test_construct_graph.py @@ -21,7 +21,6 @@ import pyarrow.parquet as pq import numpy as np import dgl -import argparse import torch as th from functools import partial From 2c9ec4e8d56080900309df989e8a7277df86f32f Mon Sep 17 00:00:00 2001 From: JalenCato Date: Thu, 14 Dec 2023 21:06:56 +0000 Subject: [PATCH 10/21] apply comments about reverse edges --- .github/workflow_scripts/e2e_check.sh | 1 + .../graphstorm/gconstruct/construct_graph.py | 58 ++++++++++++++----- ...s_homo.json => movielens_homogeneous.json} | 4 ++ .../data_process/homogeneous_test.sh | 35 +++++++++++ .../data_process/movielens_test.sh | 7 --- .../gconstruct/test_construct_graph.py | 4 +- 6 files changed, 85 insertions(+), 24 deletions(-) rename tests/end2end-tests/data_gen/{movielens_homo.json => movielens_homogeneous.json} (93%) create mode 100644 tests/end2end-tests/data_process/homogeneous_test.sh diff --git a/.github/workflow_scripts/e2e_check.sh b/.github/workflow_scripts/e2e_check.sh index 9851a35529..8c122c9f9d 100644 --- a/.github/workflow_scripts/e2e_check.sh +++ b/.github/workflow_scripts/e2e_check.sh @@ -8,6 +8,7 @@ sh ./tests/end2end-tests/create_data.sh sh ./tests/end2end-tests/tools/test_mem_est.sh sh ./tests/end2end-tests/data_process/test.sh sh ./tests/end2end-tests/data_process/movielens_test.sh +sh ./tests/end2end-tests/data_process/homogeneous_test.sh sh ./tests/end2end-tests/custom-gnn/run_test.sh bash ./tests/end2end-tests/graphstorm-nc/test.sh bash ./tests/end2end-tests/graphstorm-lp/test.sh diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py index 6eb1b65678..8939d07605 100644 --- a/python/graphstorm/gconstruct/construct_graph.py +++ b/python/graphstorm/gconstruct/construct_graph.py @@ -28,6 +28,7 @@ import numpy as np import torch as th import dgl +from dgl.distributed.constants import DEFAULT_NTYPE, DEFAULT_ETYPE from ..utils import sys_tracker, get_log_level from .file_io import parse_node_file_format, parse_edge_file_format @@ -582,12 +583,23 @@ def process_edge_data(process_confs, node_id_map, arr_merger, return (edges, edge_data, label_stats) -def verify_confs(confs, rev_edges): +def is_homogeneous(confs): + """ Verify if it is a homogeneous graph + Parameter + --------- + confs: dict + A dict containing all user input config + """ + ntypes = {conf['node_type'] for conf in confs["nodes"]} + etypes = set(tuple(conf['relation']) for conf in confs["edges"]) + return len(ntypes) == 1 and len(etypes) == 1 + +def verify_confs(confs): """ Verify the configuration of the input data. - Parameters - ---------- - rev_edges: bool - Whether to add reverse edges + Parameter + --------- + confs: dict + A dict containing all user input config """ if "version" not in confs: # TODO: Make a requirement with v1.0 launch @@ -596,8 +608,7 @@ def verify_confs(confs, rev_edges): ntypes = {conf['node_type'] for conf in confs["nodes"]} etypes = [conf['relation'] for conf in confs["edges"]] # Adjust input to DGL homogeneous graph format if it is a homogeneous graph - etype_set = set(tuple(relation) for relation in etypes) - if len(ntypes) == 1 and len(etype_set) == 1 and not rev_edges: + if is_homogeneous(confs): assert etypes[0][0] in ntypes, \ f"source node type {etypes[0][0]} does not exist. Please check your input data." assert etypes[0][2] in ntypes, \ @@ -605,9 +616,9 @@ def verify_confs(confs, rev_edges): logging.warning("Generated Graph is a homogeneous graph, so the node type will be " "changed to _N and edge type will be changed to [_N, _E, _N]") for node in confs['nodes']: - node['node_type'] = "_N" + node['node_type'] = DEFAULT_NTYPE for edge in confs['edges']: - edge['relation'] = ["_N", "_E", "_N"] + edge['relation'] = DEFAULT_ETYPE for etype in etypes: assert len(etype) == 3, \ "The edge type must be (source node type, relation type, dest node type)." @@ -685,7 +696,7 @@ def process_graph(args): if args.num_processes_for_nodes is not None else args.num_processes num_processes_for_edges = args.num_processes_for_edges \ if args.num_processes_for_edges is not None else args.num_processes - verify_confs(process_confs, args.add_reverse_edges) + verify_confs(process_confs) output_format = args.output_format for out_format in output_format: assert out_format in ["DGL", "DistDGL"], \ @@ -715,12 +726,29 @@ def process_graph(args): if args.add_reverse_edges: edges1 = {} - for etype in edges: - e = edges[etype] + if is_homogeneous(process_confs): + logging.warning("For homogeneous graph, the generated reverse edge will " + "be the same edge type as the original graph. Instead for " + "heterogeneous graph, the generated reverse edge type will " + "add -rev as a suffix") + e = edges[DEFAULT_ETYPE] assert isinstance(e, tuple) and len(e) == 2 - assert isinstance(etype, tuple) and len(etype) == 3 - edges1[etype] = e - edges1[etype[2], etype[1] + "-rev", etype[0]] = (e[1], e[0]) + edges1[DEFAULT_ETYPE] = e + edges1[DEFAULT_ETYPE] = (np.concatenate([e[0], e[1]]), + np.concatenate([e[1], e[0]])) + if DEFAULT_ETYPE in edge_data: + data = edge_data[DEFAULT_ETYPE] + logging.warning("Reverse edge for homogeneous graph will have same feature as " + "what we have in the original edges") + for key, value in data.items(): + data[key] = np.concatenate([value, value]) + else: + for etype in edges: + e = edges[etype] + assert isinstance(e, tuple) and len(e) == 2 + assert isinstance(etype, tuple) and len(etype) == 3 + edges1[etype] = e + edges1[etype[2], etype[1] + "-rev", etype[0]] = (e[1], e[0]) edges = edges1 sys_tracker.check('Add reverse edges') g = dgl.heterograph(edges, num_nodes_dict=num_nodes) diff --git a/tests/end2end-tests/data_gen/movielens_homo.json b/tests/end2end-tests/data_gen/movielens_homogeneous.json similarity index 93% rename from tests/end2end-tests/data_gen/movielens_homo.json rename to tests/end2end-tests/data_gen/movielens_homogeneous.json index c0f7457859..79e3e27ceb 100644 --- a/tests/end2end-tests/data_gen/movielens_homo.json +++ b/tests/end2end-tests/data_gen/movielens_homogeneous.json @@ -42,6 +42,10 @@ "relation": ["movie", "rating", "movie"], "format": {"name": "parquet"}, "files": "/data/ml-100k/edges_homo.parquet", + "features": [ + { + "feature_col": "rate" + }], "labels": [ { "label_col": "rate", diff --git a/tests/end2end-tests/data_process/homogeneous_test.sh b/tests/end2end-tests/data_process/homogeneous_test.sh new file mode 100644 index 0000000000..e76c4ef741 --- /dev/null +++ b/tests/end2end-tests/data_process/homogeneous_test.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +service ssh restart + +GS_HOME=$(pwd) +NUM_TRAINERS=4 +export PYTHONPATH=$GS_HOME/python/ +cd $GS_HOME/training_scripts/gsgnn_np +echo "127.0.0.1" > ip_list.txt +cd $GS_HOME/training_scripts/gsgnn_ep +echo "127.0.0.1" > ip_list.txt + +error_and_exit () { + # check exec status of launch.py + status=$1 + echo $status + + if test $status -ne 0 + then + exit -1 + fi +} + + +echo "********* Test Homogeneous Graph Optimization ********" +python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens_homogenous.json --num-processes 1 --output-dir /tmp/movielen_100k_train_val_1p_4t_homogeneous --graph-name movie-lens-100k +error_and_exit $? + +echo "********* Test Node Classification on GConstruct Homogeneous Graph ********" +python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homogeneous/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --target-ntype _N +error_and_exit $? + +echo "********* Test Edge Classification on GConstruct Homogeneous Graph ********" +python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homogeneous/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --target-etype _N,_E,_N +error_and_exit $? \ No newline at end of file diff --git a/tests/end2end-tests/data_process/movielens_test.sh b/tests/end2end-tests/data_process/movielens_test.sh index e47a7aaac9..200d1f8764 100644 --- a/tests/end2end-tests/data_process/movielens_test.sh +++ b/tests/end2end-tests/data_process/movielens_test.sh @@ -28,13 +28,6 @@ python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2 error_and_exit $? -echo "********* Test Homogeneous Graph Optimization ********" -python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens_homo.json --num-processes 1 --output-dir /tmp/movielen_100k_train_val_1p_4t_homo --graph-name movie-lens-100k -error_and_exit $? - -python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homo/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --target-ntype _N -error_and_exit $? - echo "********* Test the DistDGL graph format with BERT embeddings ********" python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens.json --num-processes 1 --output-dir /tmp/movielens_bert_emb --graph-name ml --add-reverse-edges diff --git a/tests/unit-tests/gconstruct/test_construct_graph.py b/tests/unit-tests/gconstruct/test_construct_graph.py index 3025dfc031..873e68d1b7 100644 --- a/tests/unit-tests/gconstruct/test_construct_graph.py +++ b/tests/unit-tests/gconstruct/test_construct_graph.py @@ -1706,7 +1706,7 @@ def test_gc(): "Directory /tmp_featurewrapper2 should not exist after gc" -def test_homo(): +def test_homogeneous(): # single node type and edge type input conf = { "version": "gconstruct-v0.1", "nodes": [ @@ -1776,4 +1776,4 @@ def test_homo(): test_multicolumn(None) test_multicolumn("/") test_feature_wrapper() - test_homo() \ No newline at end of file + test_homogeneous() \ No newline at end of file From 27a7c9df492fb9863305839c1e8b58c090aa6eab Mon Sep 17 00:00:00 2001 From: JalenCato Date: Thu, 14 Dec 2023 21:11:58 +0000 Subject: [PATCH 11/21] fix unit test --- python/graphstorm/gconstruct/construct_graph.py | 3 ++- .../gconstruct/test_construct_graph.py | 16 ++-------------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py index 8939d07605..56d28e87f2 100644 --- a/python/graphstorm/gconstruct/construct_graph.py +++ b/python/graphstorm/gconstruct/construct_graph.py @@ -618,7 +618,7 @@ def verify_confs(confs): for node in confs['nodes']: node['node_type'] = DEFAULT_NTYPE for edge in confs['edges']: - edge['relation'] = DEFAULT_ETYPE + edge['relation'] = list(DEFAULT_ETYPE) for etype in etypes: assert len(etype) == 3, \ "The edge type must be (source node type, relation type, dest node type)." @@ -736,6 +736,7 @@ def process_graph(args): edges1[DEFAULT_ETYPE] = e edges1[DEFAULT_ETYPE] = (np.concatenate([e[0], e[1]]), np.concatenate([e[1], e[0]])) + # Double edge feature as it is necessary to match tensor size in generated graph if DEFAULT_ETYPE in edge_data: data = edge_data[DEFAULT_ETYPE] logging.warning("Reverse edge for homogeneous graph will have same feature as " diff --git a/tests/unit-tests/gconstruct/test_construct_graph.py b/tests/unit-tests/gconstruct/test_construct_graph.py index 873e68d1b7..672105d435 100644 --- a/tests/unit-tests/gconstruct/test_construct_graph.py +++ b/tests/unit-tests/gconstruct/test_construct_graph.py @@ -1720,16 +1720,10 @@ def test_homogeneous(): "format": {"name": "parquet"}, "files": "/data/ml-100k/edges_homo.parquet", "labels": [ {"label_col": "rate", "task_type": "classification", "split_pct": [0.1, 0.1, 0.1]}]}] } - verify_confs(conf, rev_edges=False) + verify_confs(conf) assert conf['nodes'][0]["node_type"] == "_N" assert conf['edges'][0]['relation'] == ["_N", "_E", "_N"] - conf['nodes'][0]["node_type"] = "movie" - conf['edges'][0]['relation'] = ['movie', 'rating', 'movie'] - verify_confs(conf, rev_edges=True) - assert conf['nodes'][0]["node_type"] == "movie" - assert conf['edges'][0]['relation'] == ["movie", "rating", "movie"] - # multiple node types and edge types input conf = { "version": "gconstruct-v0.1", "nodes": [ @@ -1747,16 +1741,10 @@ def test_homogeneous(): {"relation": ["movie", "rating", "movie"], "format": {"name": "parquet"}, "files": "/data/ml-100k/edges_homo.parquet"}] } - verify_confs(conf, rev_edges=False) + verify_confs(conf) assert conf['nodes'][0]["node_type"] == "_N" assert conf['edges'][0]['relation'] == ["_N", "_E", "_N"] - conf['nodes'][0]["node_type"] = "movie" - conf['edges'][0]['relation'] = ['movie', 'rating', 'movie'] - verify_confs(conf, rev_edges=True) - assert conf['nodes'][0]["node_type"] == "movie" - assert conf['edges'][0]['relation'] == ["movie", "rating", "movie"] - if __name__ == '__main__': test_parse_edge_data() test_multiprocessing_checks() From f71d8969e815a4f34f818a9ee008299676b3b795 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Thu, 14 Dec 2023 21:16:19 +0000 Subject: [PATCH 12/21] test --- tests/end2end-tests/data_process/homogeneous_test.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/end2end-tests/data_process/homogeneous_test.sh b/tests/end2end-tests/data_process/homogeneous_test.sh index e76c4ef741..4f8db47e65 100644 --- a/tests/end2end-tests/data_process/homogeneous_test.sh +++ b/tests/end2end-tests/data_process/homogeneous_test.sh @@ -32,4 +32,16 @@ error_and_exit $? echo "********* Test Edge Classification on GConstruct Homogeneous Graph ********" python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homogeneous/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --target-etype _N,_E,_N +error_and_exit $? + +echo "********* Test Homogeneous Graph Optimization on reverse edge********" +python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens_homogenous.json --num-processes 1 --output-dir /tmp/movielen_100k_train_val_1p_4t_homogeneous --graph-name movie-lens-100k --add-reverse-edges +error_and_exit $? + +echo "********* Test Node Classification on GConstruct Homogeneous Graph on reverse edge********" +python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homogeneous/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --target-ntype _N +error_and_exit $? + +echo "********* Test Edge Classification on GConstruct Homogeneous Graph on reverse edge ********" +python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homogeneous/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --target-etype _N,_E,_N error_and_exit $? \ No newline at end of file From 9fd4a7ab4e1360e54fe3c0a7b0f59cd1bf4135ee Mon Sep 17 00:00:00 2001 From: JalenCato Date: Thu, 14 Dec 2023 22:42:17 +0000 Subject: [PATCH 13/21] add reverse edge check --- .../graphstorm/gconstruct/construct_graph.py | 7 ++- .../data_process/check_homogeneous.py | 60 +++++++++++++++++++ .../data_process/homogeneous_test.sh | 14 +++-- 3 files changed, 76 insertions(+), 5 deletions(-) create mode 100644 tests/end2end-tests/data_process/check_homogeneous.py diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py index 56d28e87f2..096f9666b7 100644 --- a/python/graphstorm/gconstruct/construct_graph.py +++ b/python/graphstorm/gconstruct/construct_graph.py @@ -737,12 +737,17 @@ def process_graph(args): edges1[DEFAULT_ETYPE] = (np.concatenate([e[0], e[1]]), np.concatenate([e[1], e[0]])) # Double edge feature as it is necessary to match tensor size in generated graph + # Only generate mask on original graph if DEFAULT_ETYPE in edge_data: data = edge_data[DEFAULT_ETYPE] logging.warning("Reverse edge for homogeneous graph will have same feature as " "what we have in the original edges") for key, value in data.items(): - data[key] = np.concatenate([value, value]) + if key not in ["train_mask", "test_mask", "val_mask"]: + data[key] = np.concatenate([value, value]) + else: + data[key] = np.concatenate([value, [0]*len(value)]) + else: for etype in edges: e = edges[etype] diff --git a/tests/end2end-tests/data_process/check_homogeneous.py b/tests/end2end-tests/data_process/check_homogeneous.py new file mode 100644 index 0000000000..daeb6f0ada --- /dev/null +++ b/tests/end2end-tests/data_process/check_homogeneous.py @@ -0,0 +1,60 @@ +""" + Copyright 2023 Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +""" +import os +import argparse +import dgl +from dgl.distributed.constants import DEFAULT_NTYPE, DEFAULT_ETYPE +from numpy.testing import assert_almost_equal + + +def check_reverse_edge(args): + + g_orig = dgl.load_graphs(os.path.join(args.orig_graph_path, "graph.dgl"))[0][0] + g_rev = dgl.load_graphs(os.path.join(args.rev_graph_path, "graph.dgl"))[0][0] + assert g_orig.ntypes == g_rev.ntypes + assert g_orig.etypes == g_rev.etypes + assert g_orig.number_of_nodes(DEFAULT_NTYPE) == g_rev.number_of_nodes(DEFAULT_NTYPE) + assert 2 * g_orig.number_of_edges(DEFAULT_ETYPE) == g_rev.number_of_edges(DEFAULT_ETYPE) + for ntype in g_orig.ntypes: + assert g_orig.number_of_nodes(ntype) == g_rev.number_of_nodes(ntype) + for name in g_orig.nodes[ntype].data: + # We should skip '*_mask' because data split is split randomly. + if 'mask' not in name: + assert_almost_equal(g_orig.nodes[ntype].data[name].numpy(), + g_rev.nodes[ntype].data[name].numpy()) + + # Check edge feature + g_orig_feat = dgl.data.load_tensors(os.path.join(args.orig_graph_path, "edge_feat.dgl")) + g_rev_feat = dgl.data.load_tensors(os.path.join(args.rev_graph_path, "edge_feat.dgl")) + for feat_type in g_orig_feat.keys(): + if "mask" not in feat_type: + assert_almost_equal(g_orig_feat[feat_type].numpy(), + g_rev_feat[feat_type].numpy()[:g_orig.number_of_edges(DEFAULT_ETYPE)]) + else: + assert_almost_equal(g_rev_feat[feat_type].numpy()[g_orig.number_of_edges(DEFAULT_ETYPE):], + [0] * g_orig.number_of_edges(DEFAULT_ETYPE)) + +if __name__ == '__main__': + argparser = argparse.ArgumentParser("Check edge prediction remapping") + argparser.add_argument("--orig-graph-path", type=str, default="/tmp/movielen_100k_train_val_1p_4t_homogeneous/part0/", + help="Path to save the generated data") + argparser.add_argument("--rev-graph-path", type=str, default="/tmp/movielen_100k_train_val_1p_4t_homogeneous_rev/part0/", + help="Path to save the generated data") + + args = argparser.parse_args() + + check_reverse_edge(args) \ No newline at end of file diff --git a/tests/end2end-tests/data_process/homogeneous_test.sh b/tests/end2end-tests/data_process/homogeneous_test.sh index 4f8db47e65..7f96d004fe 100644 --- a/tests/end2end-tests/data_process/homogeneous_test.sh +++ b/tests/end2end-tests/data_process/homogeneous_test.sh @@ -23,7 +23,10 @@ error_and_exit () { echo "********* Test Homogeneous Graph Optimization ********" -python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens_homogenous.json --num-processes 1 --output-dir /tmp/movielen_100k_train_val_1p_4t_homogeneous --graph-name movie-lens-100k +python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens_homogeneous.json --num-processes 1 --output-dir /tmp/movielen_100k_train_val_1p_4t_homogeneous --graph-name movie-lens-100k +error_and_exit $? + +python3 $GS_HOME/tests/end2end-tests/data_process/check_homogeneous.py error_and_exit $? echo "********* Test Node Classification on GConstruct Homogeneous Graph ********" @@ -35,13 +38,16 @@ python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_s error_and_exit $? echo "********* Test Homogeneous Graph Optimization on reverse edge********" -python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens_homogenous.json --num-processes 1 --output-dir /tmp/movielen_100k_train_val_1p_4t_homogeneous --graph-name movie-lens-100k --add-reverse-edges +python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens_homogeneous.json --num-processes 1 --output-dir /tmp/movielen_100k_train_val_1p_4t_homogeneous_rev --graph-name movie-lens-100k --add-reverse-edges +error_and_exit $? + +python3 $GS_HOME/tests/end2end-tests/data_process/check_homogeneous.py error_and_exit $? echo "********* Test Node Classification on GConstruct Homogeneous Graph on reverse edge********" -python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homogeneous/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --target-ntype _N +python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homogeneous_rev/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --target-ntype _N error_and_exit $? echo "********* Test Edge Classification on GConstruct Homogeneous Graph on reverse edge ********" -python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homogeneous/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --target-etype _N,_E,_N +python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homogeneous_rev/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --target-etype _N,_E,_N error_and_exit $? \ No newline at end of file From 7ca956390d82e72d0487b0fa723a91cae7edb392 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Thu, 14 Dec 2023 23:02:19 +0000 Subject: [PATCH 14/21] change name --- tests/end2end-tests/data_gen/movielens_homogeneous.json | 4 ++-- tests/end2end-tests/data_gen/process_movielens.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/end2end-tests/data_gen/movielens_homogeneous.json b/tests/end2end-tests/data_gen/movielens_homogeneous.json index 79e3e27ceb..018776e82e 100644 --- a/tests/end2end-tests/data_gen/movielens_homogeneous.json +++ b/tests/end2end-tests/data_gen/movielens_homogeneous.json @@ -41,7 +41,7 @@ "dest_id_col": "dst_id", "relation": ["movie", "rating", "movie"], "format": {"name": "parquet"}, - "files": "/data/ml-100k/edges_homo.parquet", + "files": "/data/ml-100k/edges_homogeneous.parquet", "features": [ { "feature_col": "rate" @@ -57,7 +57,7 @@ { "relation": ["movie", "rating", "movie"], "format": {"name": "parquet"}, - "files": "/data/ml-100k/edges_homo.parquet" + "files": "/data/ml-100k/edges_homogeneous.parquet" } ] } \ No newline at end of file diff --git a/tests/end2end-tests/data_gen/process_movielens.py b/tests/end2end-tests/data_gen/process_movielens.py index 9ecc34de35..a9ca90873e 100644 --- a/tests/end2end-tests/data_gen/process_movielens.py +++ b/tests/end2end-tests/data_gen/process_movielens.py @@ -93,7 +93,7 @@ def write_data_parquet(data, data_file): # generate data for homogeneous optimization test edges = pandas.read_csv('/data/ml-100k/u.data', delimiter='\t', header=None) edge_data = {'src_id': edges[1], 'dst_id': edges[1], 'rate': edges[2]} -write_data_parquet(edge_data, '/data/ml-100k/edges_homo.parquet') +write_data_parquet(edge_data, '/data/ml-100k/edges_homogeneous.parquet') # generate synthetic user data with label user_labels = np.random.randint(11, size=feat.shape[0]) From ee2762e6c1af3f259183780093862b2c86f9240d Mon Sep 17 00:00:00 2001 From: JalenCato Date: Thu, 14 Dec 2023 23:04:02 +0000 Subject: [PATCH 15/21] remove redundant --- python/graphstorm/gconstruct/construct_graph.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py index 096f9666b7..5b32a72686 100644 --- a/python/graphstorm/gconstruct/construct_graph.py +++ b/python/graphstorm/gconstruct/construct_graph.py @@ -733,7 +733,6 @@ def process_graph(args): "add -rev as a suffix") e = edges[DEFAULT_ETYPE] assert isinstance(e, tuple) and len(e) == 2 - edges1[DEFAULT_ETYPE] = e edges1[DEFAULT_ETYPE] = (np.concatenate([e[0], e[1]]), np.concatenate([e[1], e[0]])) # Double edge feature as it is necessary to match tensor size in generated graph From 7a0824f484ec03e45fe9943cc6e41f668236aabd Mon Sep 17 00:00:00 2001 From: JalenCato Date: Thu, 14 Dec 2023 23:52:31 +0000 Subject: [PATCH 16/21] change order --- tests/end2end-tests/data_process/homogeneous_test.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/end2end-tests/data_process/homogeneous_test.sh b/tests/end2end-tests/data_process/homogeneous_test.sh index 7f96d004fe..fe42732ebf 100644 --- a/tests/end2end-tests/data_process/homogeneous_test.sh +++ b/tests/end2end-tests/data_process/homogeneous_test.sh @@ -26,9 +26,6 @@ echo "********* Test Homogeneous Graph Optimization ********" python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens_homogeneous.json --num-processes 1 --output-dir /tmp/movielen_100k_train_val_1p_4t_homogeneous --graph-name movie-lens-100k error_and_exit $? -python3 $GS_HOME/tests/end2end-tests/data_process/check_homogeneous.py -error_and_exit $? - echo "********* Test Node Classification on GConstruct Homogeneous Graph ********" python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homogeneous/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --target-ntype _N error_and_exit $? From debc2ff042c07dc29c34c0ff19011f5b334444b8 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Fri, 15 Dec 2023 00:40:46 +0000 Subject: [PATCH 17/21] add --- python/graphstorm/gconstruct/construct_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py index 5b32a72686..6f63c3ee6e 100644 --- a/python/graphstorm/gconstruct/construct_graph.py +++ b/python/graphstorm/gconstruct/construct_graph.py @@ -737,7 +737,7 @@ def process_graph(args): np.concatenate([e[1], e[0]])) # Double edge feature as it is necessary to match tensor size in generated graph # Only generate mask on original graph - if DEFAULT_ETYPE in edge_data: + if edge_data: data = edge_data[DEFAULT_ETYPE] logging.warning("Reverse edge for homogeneous graph will have same feature as " "what we have in the original edges") From 77b4b5f8292845108232d06e14426860cf4f54a8 Mon Sep 17 00:00:00 2001 From: jalencato Date: Fri, 15 Dec 2023 11:03:22 -0800 Subject: [PATCH 18/21] Apply suggestions from code review Co-authored-by: xiang song(charlie.song) --- python/graphstorm/gconstruct/construct_graph.py | 2 +- tests/end2end-tests/data_process/homogeneous_test.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py index 6f63c3ee6e..b16b4ef936 100644 --- a/python/graphstorm/gconstruct/construct_graph.py +++ b/python/graphstorm/gconstruct/construct_graph.py @@ -745,7 +745,7 @@ def process_graph(args): if key not in ["train_mask", "test_mask", "val_mask"]: data[key] = np.concatenate([value, value]) else: - data[key] = np.concatenate([value, [0]*len(value)]) + data[key] = np.concatenate([value, numpy.zeros(value.shape, dtype=value.dtype)]) else: for etype in edges: diff --git a/tests/end2end-tests/data_process/homogeneous_test.sh b/tests/end2end-tests/data_process/homogeneous_test.sh index fe42732ebf..dd3f55907d 100644 --- a/tests/end2end-tests/data_process/homogeneous_test.sh +++ b/tests/end2end-tests/data_process/homogeneous_test.sh @@ -41,10 +41,10 @@ error_and_exit $? python3 $GS_HOME/tests/end2end-tests/data_process/check_homogeneous.py error_and_exit $? -echo "********* Test Node Classification on GConstruct Homogeneous Graph on reverse edge********" +echo "********* Test Node Classification on GConstruct Homogeneous Graph with reverse edge********" python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homogeneous_rev/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --target-ntype _N error_and_exit $? -echo "********* Test Edge Classification on GConstruct Homogeneous Graph on reverse edge ********" +echo "********* Test Edge Classification on GConstruct Homogeneous Graph with reverse edge ********" python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homogeneous_rev/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --target-etype _N,_E,_N error_and_exit $? \ No newline at end of file From f02c0d61cd057b1fc9ba0c6fbf2488336112028d Mon Sep 17 00:00:00 2001 From: JalenCato Date: Fri, 15 Dec 2023 19:46:35 +0000 Subject: [PATCH 19/21] apply comments --- .../graphstorm/gconstruct/construct_graph.py | 16 ++++++------ .../data_process/movielens_test.sh | 3 +-- .../gconstruct/test_construct_graph.py | 25 ++++++++++++++++++- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py index b16b4ef936..ebd8143593 100644 --- a/python/graphstorm/gconstruct/construct_graph.py +++ b/python/graphstorm/gconstruct/construct_graph.py @@ -591,8 +591,13 @@ def is_homogeneous(confs): A dict containing all user input config """ ntypes = {conf['node_type'] for conf in confs["nodes"]} - etypes = set(tuple(conf['relation']) for conf in confs["edges"]) - return len(ntypes) == 1 and len(etypes) == 1 + etypes = [conf['relation'] for conf in confs["edges"]] + etypes_set = set(tuple(conf['relation']) for conf in confs["edges"]) + assert etypes[0][0] in ntypes, \ + f"source node type {etypes[0][0]} does not exist. Please check your input data." + assert etypes[0][2] in ntypes, \ + f"dest node type {etypes[0][2]} does not exist. Please check your input data." + return len(ntypes) == 1 and len(etypes_set) == 1 def verify_confs(confs): """ Verify the configuration of the input data. @@ -609,10 +614,6 @@ def verify_confs(confs): etypes = [conf['relation'] for conf in confs["edges"]] # Adjust input to DGL homogeneous graph format if it is a homogeneous graph if is_homogeneous(confs): - assert etypes[0][0] in ntypes, \ - f"source node type {etypes[0][0]} does not exist. Please check your input data." - assert etypes[0][2] in ntypes, \ - f"dest node type {etypes[0][2]} does not exist. Please check your input data." logging.warning("Generated Graph is a homogeneous graph, so the node type will be " "changed to _N and edge type will be changed to [_N, _E, _N]") for node in confs['nodes']: @@ -745,7 +746,8 @@ def process_graph(args): if key not in ["train_mask", "test_mask", "val_mask"]: data[key] = np.concatenate([value, value]) else: - data[key] = np.concatenate([value, numpy.zeros(value.shape, dtype=value.dtype)]) + data[key] = np.concatenate([value, np.zeros(value.shape, + dtype=value.dtype)]) else: for etype in edges: diff --git a/tests/end2end-tests/data_process/movielens_test.sh b/tests/end2end-tests/data_process/movielens_test.sh index 200d1f8764..455330bc38 100644 --- a/tests/end2end-tests/data_process/movielens_test.sh +++ b/tests/end2end-tests/data_process/movielens_test.sh @@ -5,9 +5,8 @@ service ssh restart GS_HOME=$(pwd) NUM_TRAINERS=4 export PYTHONPATH=$GS_HOME/python/ -cd $GS_HOME/training_scripts/gsgnn_np -echo "127.0.0.1" > ip_list.txt cd $GS_HOME/training_scripts/gsgnn_ep + echo "127.0.0.1" > ip_list.txt error_and_exit () { diff --git a/tests/unit-tests/gconstruct/test_construct_graph.py b/tests/unit-tests/gconstruct/test_construct_graph.py index 672105d435..9913f620d8 100644 --- a/tests/unit-tests/gconstruct/test_construct_graph.py +++ b/tests/unit-tests/gconstruct/test_construct_graph.py @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. """ +import copy import random import os import tempfile @@ -22,11 +23,12 @@ import numpy as np import dgl import torch as th +import copy from functools import partial from numpy.testing import assert_equal, assert_almost_equal -from graphstorm.gconstruct.construct_graph import parse_edge_data, verify_confs +from graphstorm.gconstruct.construct_graph import parse_edge_data, verify_confs, is_homogeneous from graphstorm.gconstruct.file_io import write_data_parquet, read_data_parquet from graphstorm.gconstruct.file_io import write_data_json, read_data_json from graphstorm.gconstruct.file_io import write_data_csv, read_data_csv @@ -1720,9 +1722,20 @@ def test_homogeneous(): "format": {"name": "parquet"}, "files": "/data/ml-100k/edges_homo.parquet", "labels": [ {"label_col": "rate", "task_type": "classification", "split_pct": [0.1, 0.1, 0.1]}]}] } + assert is_homogeneous(conf) verify_confs(conf) assert conf['nodes'][0]["node_type"] == "_N" assert conf['edges'][0]['relation'] == ["_N", "_E", "_N"] + conf["edges"][0]["relation"] = ["movie_fake", "rating", "movie"] + try: + is_homogeneous(conf) + except AssertionError as e: + assert str(e) == "source node type movie_fake does not exist. Please check your input data." + conf["nodes"].append(copy.deepcopy(conf["nodes"][0])) + conf["nodes"][0]["node_type"] = "movie" + conf["nodes"][1]["node_type"] = "movie_fake" + assert not is_homogeneous(conf) + # multiple node types and edge types input conf = { @@ -1741,9 +1754,19 @@ def test_homogeneous(): {"relation": ["movie", "rating", "movie"], "format": {"name": "parquet"}, "files": "/data/ml-100k/edges_homo.parquet"}] } + assert is_homogeneous(conf) verify_confs(conf) assert conf['nodes'][0]["node_type"] == "_N" assert conf['edges'][0]['relation'] == ["_N", "_E", "_N"] + conf["edges"][0]["relation"] = ["movie_fake", "rating", "movie"] + try: + is_homogeneous(conf) + except AssertionError as e: + assert str(e) == "source node type movie_fake does not exist. Please check your input data." + conf["nodes"].append(copy.deepcopy(conf["nodes"][0])) + conf["nodes"][0]["node_type"] = "movie" + conf["nodes"][1]["node_type"] = "movie_fake" + assert not is_homogeneous(conf) if __name__ == '__main__': test_parse_edge_data() From 5dca4ad081c5773c73d7c58f476d60c8146ddd62 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Fri, 15 Dec 2023 19:54:25 +0000 Subject: [PATCH 20/21] refactor --- .../graphstorm/gconstruct/construct_graph.py | 25 ++++++++----------- .../gconstruct/test_construct_graph.py | 8 ------ 2 files changed, 10 insertions(+), 23 deletions(-) diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py index ebd8143593..259399328b 100644 --- a/python/graphstorm/gconstruct/construct_graph.py +++ b/python/graphstorm/gconstruct/construct_graph.py @@ -591,13 +591,8 @@ def is_homogeneous(confs): A dict containing all user input config """ ntypes = {conf['node_type'] for conf in confs["nodes"]} - etypes = [conf['relation'] for conf in confs["edges"]] - etypes_set = set(tuple(conf['relation']) for conf in confs["edges"]) - assert etypes[0][0] in ntypes, \ - f"source node type {etypes[0][0]} does not exist. Please check your input data." - assert etypes[0][2] in ntypes, \ - f"dest node type {etypes[0][2]} does not exist. Please check your input data." - return len(ntypes) == 1 and len(etypes_set) == 1 + etypes = set(tuple(conf['relation']) for conf in confs["edges"]) + return len(ntypes) == 1 and len(etypes) == 1 def verify_confs(confs): """ Verify the configuration of the input data. @@ -612,14 +607,6 @@ def verify_confs(confs): "The config file does not have a 'version' entry. Assuming gconstruct-v0.1") ntypes = {conf['node_type'] for conf in confs["nodes"]} etypes = [conf['relation'] for conf in confs["edges"]] - # Adjust input to DGL homogeneous graph format if it is a homogeneous graph - if is_homogeneous(confs): - logging.warning("Generated Graph is a homogeneous graph, so the node type will be " - "changed to _N and edge type will be changed to [_N, _E, _N]") - for node in confs['nodes']: - node['node_type'] = DEFAULT_NTYPE - for edge in confs['edges']: - edge['relation'] = list(DEFAULT_ETYPE) for etype in etypes: assert len(etype) == 3, \ "The edge type must be (source node type, relation type, dest node type)." @@ -628,6 +615,14 @@ def verify_confs(confs): f"source node type {src_type} does not exist. Please check your input data." assert dst_type in ntypes, \ f"dest node type {dst_type} does not exist. Please check your input data." + # Adjust input to DGL homogeneous graph format if it is a homogeneous graph + if is_homogeneous(confs): + logging.warning("Generated Graph is a homogeneous graph, so the node type will be " + "changed to _N and edge type will be changed to [_N, _E, _N]") + for node in confs['nodes']: + node['node_type'] = DEFAULT_NTYPE + for edge in confs['edges']: + edge['relation'] = list(DEFAULT_ETYPE) def print_graph_info(g, node_data, edge_data, node_label_stats, edge_label_stats): """ Print graph information. diff --git a/tests/unit-tests/gconstruct/test_construct_graph.py b/tests/unit-tests/gconstruct/test_construct_graph.py index 9913f620d8..384d36fdd3 100644 --- a/tests/unit-tests/gconstruct/test_construct_graph.py +++ b/tests/unit-tests/gconstruct/test_construct_graph.py @@ -1727,10 +1727,6 @@ def test_homogeneous(): assert conf['nodes'][0]["node_type"] == "_N" assert conf['edges'][0]['relation'] == ["_N", "_E", "_N"] conf["edges"][0]["relation"] = ["movie_fake", "rating", "movie"] - try: - is_homogeneous(conf) - except AssertionError as e: - assert str(e) == "source node type movie_fake does not exist. Please check your input data." conf["nodes"].append(copy.deepcopy(conf["nodes"][0])) conf["nodes"][0]["node_type"] = "movie" conf["nodes"][1]["node_type"] = "movie_fake" @@ -1759,10 +1755,6 @@ def test_homogeneous(): assert conf['nodes'][0]["node_type"] == "_N" assert conf['edges'][0]['relation'] == ["_N", "_E", "_N"] conf["edges"][0]["relation"] = ["movie_fake", "rating", "movie"] - try: - is_homogeneous(conf) - except AssertionError as e: - assert str(e) == "source node type movie_fake does not exist. Please check your input data." conf["nodes"].append(copy.deepcopy(conf["nodes"][0])) conf["nodes"][0]["node_type"] = "movie" conf["nodes"][1]["node_type"] = "movie_fake" From 4c0b9965377e286e5b13f2a7f714f85357e3888e Mon Sep 17 00:00:00 2001 From: jalencato Date: Fri, 15 Dec 2023 13:39:01 -0800 Subject: [PATCH 21/21] Update test_construct_graph.py --- tests/unit-tests/gconstruct/test_construct_graph.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/unit-tests/gconstruct/test_construct_graph.py b/tests/unit-tests/gconstruct/test_construct_graph.py index 384d36fdd3..d7c9ae6650 100644 --- a/tests/unit-tests/gconstruct/test_construct_graph.py +++ b/tests/unit-tests/gconstruct/test_construct_graph.py @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. """ -import copy import random import os import tempfile @@ -1779,4 +1778,4 @@ def test_homogeneous(): test_multicolumn(None) test_multicolumn("/") test_feature_wrapper() - test_homogeneous() \ No newline at end of file + test_homogeneous()