add e2e test

awslabs · Dec 11, 2023 · 98038fd · 98038fd
1 parent 3de2100
commit 98038fd
Show file tree

Hide file tree

Showing 4 changed files with 61 additions and 4 deletions.
diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py
@@ -582,7 +582,7 @@ def process_edge_data(process_confs, node_id_map, arr_merger,
 
     return (edges, edge_data, label_stats)
 
-def verify_confs(confs, args=None):
+def verify_confs(confs, rev_edges):
     """ Verify the configuration of the input data.
     """
     if "version" not in confs:
@@ -592,7 +592,7 @@ def verify_confs(confs, args=None):
     ntypes = {conf['node_type'] for conf in confs["nodes"]}
     etypes = [conf['relation'] for conf in confs["edges"]]
     # Adjust input to DGL requirement if it is a homogeneous graph
-    if len(ntypes) == 1 and len(etypes) == 1 and not args.add_reverse_edges:
+    if len(ntypes) == 1 and len(etypes) == 1 and not rev_edges:
         assert etypes[0][0] in ntypes, \
             f"source node type {etypes[0][0]} does not exist. Please check your input data."
         assert etypes[0][2] in ntypes, \
@@ -678,7 +678,7 @@ def process_graph(args):
             if args.num_processes_for_nodes is not None else args.num_processes
     num_processes_for_edges = args.num_processes_for_edges \
             if args.num_processes_for_edges is not None else args.num_processes
-    verify_confs(process_confs, args)
+    verify_confs(process_confs, args.add_reverse_edges)
     output_format = args.output_format
     for out_format in output_format:
         assert out_format in ["DGL", "DistDGL"], \

diff --git a/tests/end2end-tests/data_gen/movielens_homo.json b/tests/end2end-tests/data_gen/movielens_homo.json
@@ -0,0 +1,44 @@
+{
+    "version": "gconstruct-v0.1",
+    "nodes": [
+            {
+                    "node_id_col":  "id",
+                    "node_type":    "movie",
+                    "format":       {"name": "parquet"},
+                    "files":        "/data/ml-100k/movie.parquet",
+                    "features":     [
+                        {
+                                "feature_col":  "title",
+                                "transform":    {
+                                        "name": "bert_hf",
+                                        "bert_model": "bert-base-uncased",
+                                        "max_seq_length": 16
+                                }
+                        }
+                 ],
+                    "labels":	[
+                        {
+                            "label_col":	"label",
+                            "task_type":	"classification",
+                            "split_pct":	[0.8, 0.1, 0.1]
+                        }
+                    ]
+            }
+    ],
+    "edges": [
+            {
+                    "source_id_col":    "src_id",
+                    "dest_id_col":      "dst_id",
+                    "relation":         ["movie", "rating", "movie"],
+                    "format":           {"name": "parquet"},
+                    "files":        "/data/ml-100k/edges_homo.parquet",
+                    "labels":	[
+                        {
+                            "label_col":	"rate",
+                            "task_type":	"classification",
+                            "split_pct":	[0.1, 0.1, 0.1]
+                        }
+                    ]
+            }
+    ]
+}
diff --git a/tests/end2end-tests/data_gen/process_movielens.py b/tests/end2end-tests/data_gen/process_movielens.py
@@ -90,6 +90,11 @@ def write_data_parquet(data, data_file):
 edge_data = {'src_id': edges[0], 'dst_id': edges[1], 'rate': edges[2]}
 write_data_parquet(edge_data, '/data/ml-100k/edges.parquet')
 
+# generate edge data for the homogeneous optimization test: src_id and dst_id both use column 1 of u.data, so every edge is a self-loop
+edges = pandas.read_csv('/data/ml-100k/u.data', delimiter='\t', header=None)
+edge_data = {'src_id': edges[1], 'dst_id': edges[1], 'rate': edges[2]}
+write_data_parquet(edge_data, '/data/ml-100k/edges_homo.parquet')
+
 # generate synthetic user data with label
 user_labels = np.random.randint(11, size=feat.shape[0])
 user_data = {'id': user['id'].values, 'feat': feat, 'occupation': user['occupation'], 'label': user_labels}

diff --git a/tests/end2end-tests/data_process/movielens_test.sh b/tests/end2end-tests/data_process/movielens_test.sh
@@ -5,8 +5,9 @@ service ssh restart
 GS_HOME=$(pwd)
 NUM_TRAINERS=4
 export PYTHONPATH=$GS_HOME/python/
+cd $GS_HOME/training_scripts/gsgnn_np
+echo "127.0.0.1" > ip_list.txt
 cd $GS_HOME/training_scripts/gsgnn_ep
-
 echo "127.0.0.1" > ip_list.txt
 
 error_and_exit () {
@@ -27,6 +28,13 @@ python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2
 
 error_and_exit $?
 
+echo "********* Test Homogeneous Graph Optimization ********"
+python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens_homo.json --num-processes 1 --output-dir /tmp/movielen_100k_train_val_1p_4t_homo --graph-name movie-lens-100k
+error_and_exit $?
+
+python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homo/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --target-ntype _N
+error_and_exit $?
+
 echo "********* Test the DistDGL graph format with BERT embeddings ********"
 python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens.json --num-processes 1 --output-dir /tmp/movielens_bert_emb --graph-name ml --add-reverse-edges