Add end2end CI test for wholegraph (#732)
*Issue #, if available:*
There is no end-to-end test for WholeGraph sparse embedding.

*Description of changes:*
Add end-to-end test scripts covering WholeGraph sparse embedding in the multi-GPU CI (edge classification, link prediction, and node classification).


By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution, under the terms of your
choice.

---------

Co-authored-by: Xiang Song <[email protected]>
classicsong and Xiang Song authored Feb 16, 2024
1 parent b070910 commit b4c3511
Showing 9 changed files with 196 additions and 8 deletions.
2 changes: 2 additions & 0 deletions .github/workflow_scripts/e2e_mgpu_check.sh
@@ -1,6 +1,8 @@
# Move to parent directory
cd ../../

pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com pylibwholegraph-cu11

set -ex

sh ./tests/end2end-tests/setup.sh
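For context: pylibwholegraph-cu11 is NVIDIA's WholeGraph wheel for CUDA 11, pulled from the pypi.nvidia.com index. A quick way to confirm the install worked — a sketch, not part of this PR, assuming the wheel exposes the pylibwholegraph.torch PyTorch binding:

# Sanity check (not part of this PR): confirm the WholeGraph wheel installed
# correctly before the e2e tests run. Assumes pylibwholegraph.torch is the
# PyTorch binding shipped by the pylibwholegraph-cu11 wheel.
try:
    import pylibwholegraph.torch  # noqa: F401
    print("pylibwholegraph is importable")
except ImportError as err:
    raise SystemExit(f"pylibwholegraph missing or broken: {err}")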
11 changes: 11 additions & 0 deletions python/graphstorm/model/embed.py
@@ -151,6 +151,17 @@ def in_dims(self):
"""
return None

@property
def use_wholegraph_sparse_emb(self):
""" Whether or not to use WholeGraph to host embeddings for sparse updates.
Note: By default, a GSNodeInputLayer does not support WholeGraph
sparse embedding, unless implemented specifically.
Note: GSNodeEncoderInputLayer supports WholeGraph sparse embedding.
"""
return False


class GSNodeEncoderInputLayer(GSNodeInputLayer):
"""The input encoder layer for all nodes in a heterogeneous graph.
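To illustrate the contract this property establishes — a hypothetical sketch, not code from this diff: a custom input layer that hosts its learnable embeddings in WholeGraph would override the property to return True, so callers can branch on it without knowing the concrete class.

# Illustration only (not from this diff); the class body is assumed minimal.
from graphstorm.model.embed import GSNodeInputLayer

class MyWGSparseInputLayer(GSNodeInputLayer):
    """ Hypothetical input layer whose learnable embeddings live in WholeGraph. """

    @property
    def use_wholegraph_sparse_emb(self):
        # Overrides the base-class default of False.
        return True

# Callers can then branch generically:
# if input_layer.use_wholegraph_sparse_emb:
#     ... take the WholeGraph-aware save/load path ...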
6 changes: 4 additions & 2 deletions python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py
@@ -22,7 +22,7 @@
from graphstorm.inference import GSgnnEdgePredictionInferrer
from graphstorm.eval import GSgnnAccEvaluator, GSgnnRegressionEvaluator
from graphstorm.dataloading import GSgnnEdgeInferData, GSgnnEdgeDataLoader
-from graphstorm.utils import setup_device, get_lm_ntypes
+from graphstorm.utils import setup_device, get_lm_ntypes, use_wholegraph

def get_evaluator(config): # pylint: disable=unused-argument
    """ Get evaluator class
@@ -43,7 +43,9 @@ def main(config_args):
    config = GSConfig(config_args)
    config.verify_arguments(False)

-    gs.initialize(ip_config=config.ip_config, backend=config.backend)
+    use_wg_feats = use_wholegraph(config.part_config)
+    gs.initialize(ip_config=config.ip_config, backend=config.backend,
+                  use_wholegraph=config.use_wholegraph_sparse_emb or use_wg_feats)
    device = setup_device(config.local_rank)

    infer_data = GSgnnEdgeInferData(config.graph_name,
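The same two-line change is applied to all three inference entry points (edge prediction here, link prediction and node prediction below): WholeGraph must be initialized when either the partitioned graph stores its node features in WholeGraph format (use_wg_feats) or the model hosts its learnable sparse embeddings there. As a rough sketch of what the use_wholegraph(part_config) helper plausibly checks — an assumption for illustration, not this PR's code:

# Hedged sketch of a WholeGraph-feature check; the real helper is
# graphstorm.utils.use_wholegraph and may differ. The assumption is that a
# partition converted for WholeGraph keeps its features in a "wholegraph"
# folder next to the partition JSON.
import os

def maybe_uses_wholegraph(part_config):
    part_dir = os.path.dirname(part_config)
    wg_dir = os.path.join(part_dir, "wholegraph")
    return os.path.isdir(wg_dir) and len(os.listdir(wg_dir)) > 0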
10 changes: 8 additions & 2 deletions python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py
@@ -27,15 +27,21 @@
GSgnnLinkPredictionPredefinedTestDataLoader)
from graphstorm.dataloading import BUILTIN_LP_UNIFORM_NEG_SAMPLER
from graphstorm.dataloading import BUILTIN_LP_JOINT_NEG_SAMPLER
-from graphstorm.utils import setup_device, get_lm_ntypes
+from graphstorm.utils import (
+    setup_device,
+    get_lm_ntypes,
+    use_wholegraph,
+)

def main(config_args):
    """ main function
    """
    config = GSConfig(config_args)
    config.verify_arguments(False)

-    gs.initialize(ip_config=config.ip_config, backend=config.backend)
+    use_wg_feats = use_wholegraph(config.part_config)
+    gs.initialize(ip_config=config.ip_config, backend=config.backend,
+                  use_wholegraph=config.use_wholegraph_sparse_emb or use_wg_feats)
    device = setup_device(config.local_rank)

    infer_data = GSgnnEdgeInferData(config.graph_name,
6 changes: 4 additions & 2 deletions python/graphstorm/run/gsgnn_np/np_infer_gnn.py
@@ -21,7 +21,7 @@
from graphstorm.inference import GSgnnNodePredictionInferrer
from graphstorm.eval import GSgnnAccEvaluator, GSgnnRegressionEvaluator
from graphstorm.dataloading import GSgnnNodeInferData, GSgnnNodeDataLoader
-from graphstorm.utils import setup_device, get_lm_ntypes
+from graphstorm.utils import setup_device, get_lm_ntypes, use_wholegraph

def get_evaluator(config): # pylint: disable=unused-argument
    """ Get evaluator class
@@ -42,7 +42,9 @@ def main(config_args):
    config = GSConfig(config_args)
    config.verify_arguments(False)

-    gs.initialize(ip_config=config.ip_config, backend=config.backend)
+    use_wg_feats = use_wholegraph(config.part_config)
+    gs.initialize(ip_config=config.ip_config, backend=config.backend,
+                  use_wholegraph=config.use_wholegraph_sparse_emb or use_wg_feats)
    device = setup_device(config.local_rank)

    infer_data = GSgnnNodeInferData(config.graph_name,
4 changes: 2 additions & 2 deletions tests/end2end-tests/check_np_infer_emb.py
@@ -106,10 +106,10 @@
# train nids [0, 1, 2, ...]
train_emb = train_emb[th.argsort(train_nids)]
for nid, inf_emb in zip(infer_nids, infer_emb):
-    assert_almost_equal(train_emb[nid].numpy(), inf_emb.numpy(), decimal=2)
+    assert_almost_equal(train_emb[nid].numpy(), inf_emb.numpy(), decimal=1)

train_remap_embs = {}
for nid, train_emb in zip(train_remaped_nids, train_remaped_emb):
    train_remap_embs[nid] = train_emb
for nid, inf_emb in zip(infer_remaped_nids, infer_remaped_emb):
-    assert_almost_equal(train_remap_embs[nid], inf_emb, decimal=2)
+    assert_almost_equal(train_remap_embs[nid], inf_emb, decimal=1)
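The tolerance is loosened from two decimal places to one, presumably to absorb small numerical differences once WholeGraph/NCCL runs are in the mix. In numpy terms, assert_almost_equal(a, b, decimal=d) passes when abs(a - b) < 1.5 * 10**(-d) elementwise, so the per-element budget goes from 0.015 to 0.15 — a standalone illustration:

# What the loosened tolerance buys:
import numpy as np
from numpy.testing import assert_almost_equal

a = np.array([0.50, 0.30])
b = np.array([0.44, 0.36])            # elementwise |diff| = 0.06
assert_almost_equal(a, b, decimal=1)  # passes: 0.06 < 0.15
# assert_almost_equal(a, b, decimal=2) would raise: 0.06 >= 0.015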
53 changes: 53 additions & 0 deletions tests/end2end-tests/graphstorm-ec/mgpu_test.sh
@@ -225,4 +225,57 @@ python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_s
error_and_exit $?
rm -fr /data/gsgnn_ec/*

echo "**************dataset: Generated multilabel MovieLens EC, RGCN layer: 1, node feat: generated feature, inference: full graph, exclude-training-targets: True, wholegraph learnable emb"
python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_multi_label_ec/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --exclude-training-targets True --use-node-embeddings true --multilabel true --num-classes 5 --use-mini-batch-infer false --topk-model-to-save 1 --save-embed-path /data/gsgnn_wg_ec/emb/ --save-model-path /data/gsgnn_wg_ec/ --save-model-frequency 1000 --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --backend nccl --use-wholegraph-sparse-emb True

error_and_exit $?

# check prints
cnt=$(grep "save_embed_path: /data/gsgnn_wg_ec/emb/" /tmp/train_log.txt | wc -l)
if test $cnt -lt 1
then
echo "We use SageMaker task tracker, we should have save_embed_path"
exit -1
fi

cnt=$(grep "save_model_path: /data/gsgnn_wg_ec/" /tmp/train_log.txt | wc -l)
if test $cnt -lt 1
then
echo "We use SageMaker task tracker, we should have save_model_path"
exit -1
fi

bst_cnt=$(grep "Best Test accuracy" /tmp/train_log.txt | wc -l)
if test $bst_cnt -lt 1
then
echo "We use SageMaker task tracker, we should have Best Test accuracy"
exit -1
fi


best_epoch=$(grep "successfully save the model to" /tmp/train_log.txt | tail -1 | tr -d '\n' | tail -c 1)
echo "The best model is saved in epoch $best_epoch"

rm /tmp/train_log.txt

echo "**************dataset: Generated multilabel MovieLens EC, do inference on saved model, wholegraph learnable emb"
python3 -m graphstorm.run.gs_edge_classification --inference --workspace $GS_HOME/inference_scripts/ep_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_multi_label_ec/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec_infer.yaml --multilabel true --num-classes 5 --use-node-embeddings true --use-mini-batch-infer false --save-embed-path /data/gsgnn_wg_ec/infer-emb/ --restore-model-path /data/gsgnn_wg_ec/epoch-$best_epoch/ --save-prediction-path /data/gsgnn_wg_ec/prediction/ --logging-file /tmp/log.txt --logging-level debug --preserve-input True --backend nccl --use-wholegraph-sparse-emb True

error_and_exit $?

bst_cnt=$(grep "Best Test accuracy" /tmp/log.txt | wc -l)
if test $bst_cnt -lt 1
then
echo "We use SageMaker task tracker, we should have Best Test accuracy"
exit -1
fi

rm /tmp/log.txt

cd $GS_HOME/tests/end2end-tests/
python3 check_infer.py --train-embout /data/gsgnn_wg_ec/emb/ --infer-embout /data/gsgnn_wg_ec/infer-emb/

error_and_exit $?

rm -fr /data/gsgnn_wg_ec/
rm -fr /tmp/*
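check_infer.py (invoked above, and again in the link-prediction test below) compares the embeddings saved during training against those saved by the inference run on the restored model. A rough sketch of the comparison it performs — assumed from its usage here, not taken from its actual source:

# Hedged sketch of the train-vs-infer embedding comparison (assumed shape;
# the real check_infer.py also handles remapped node IDs and more options):
import numpy as np
from numpy.testing import assert_almost_equal

def compare_embeddings(train_emb: np.ndarray, infer_emb: np.ndarray):
    # Same nodes, same order: training and inference embeddings produced by
    # the same saved model should agree within tolerance.
    assert train_emb.shape == infer_emb.shape
    assert_almost_equal(train_emb, infer_emb, decimal=1)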
62 changes: 62 additions & 0 deletions tests/end2end-tests/graphstorm-lp/mgpu_test.sh
@@ -549,4 +549,66 @@ error_and_exit $?

rm -fr /data/gsgnn_lp_ml_hard_dot/*

# wholegraph sparse embedding
echo "**************dataset: Movielens, RGCN layer 2, node feat: fixed HF BERT & sparse embed, BERT nodes: movie, inference: full-graph, negative_sampler: joint, exclude_training_targets: true, save model, wholegraph learnable emb"
python3 -m graphstorm.run.gs_link_prediction --workspace $GS_HOME/training_scripts/gsgnn_lp --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --exclude-training-targets True --reverse-edge-types-map user,rating,rating-rev,movie --save-model-path /data/gsgnn_lp_ml_wg_dot/ --topk-model-to-save 1 --save-model-frequency 1000 --save-embed-path /data/gsgnn_lp_ml_wg_dot/emb/ --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --use-wholegraph-sparse-emb True --backend nccl

error_and_exit $?

# check prints
cnt=$(grep "save_embed_path: /data/gsgnn_lp_ml_wg_dot/emb/" /tmp/train_log.txt | wc -l)
if test $cnt -lt 1
then
echo "We use SageMaker task tracker, we should have save_embed_path"
exit -1
fi

cnt=$(grep "save_model_path: /data/gsgnn_lp_ml_wg_dot/" /tmp/train_log.txt | wc -l)
if test $cnt -lt 1
then
echo "We use SageMaker task tracker, we should have save_model_path"
exit -1
fi

bst_cnt=$(grep "Best Test mrr" /tmp/train_log.txt | wc -l)
if test $bst_cnt -lt 1
then
echo "We use SageMaker task tracker, we should have Best Test mrr"
exit -1
fi

best_epoch_dot=$(grep "successfully save the model to" /tmp/train_log.txt | tail -1 | tr -d '\n' | tail -c 1)
echo "The best model is saved in epoch $best_epoch_dot"

cnt=$(ls /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/user/ | wc -l)
if test $cnt != 4
then
echo "The number of sparse emb files $cnt is not equal to the number of gpus 4"
exit -1
fi

cnt=$(ls /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/movie/ | wc -l)
if test $cnt != 4
then
echo "The number of sparse emb files $cnt is not equal to the number of gpus 4"
exit -1
fi

echo "**************dataset: Movielens, do inference on saved model, decoder: dot, wholegraph learnable emb"
python3 -m graphstorm.run.gs_link_prediction --inference --workspace $GS_HOME/inference_scripts/lp_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp_infer.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --save-embed-path /data/gsgnn_lp_ml_wg_dot/infer-emb/ --restore-model-path /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/ --logging-file /tmp/log.txt --preserve-input True --use-wholegraph-sparse-emb True --backend nccl

error_and_exit $?

bst_cnt=$(grep "Best Test mrr" /tmp/log.txt | wc -l)
if test $bst_cnt -lt 1
then
echo "We use SageMaker task tracker, we should have Best Test mrr"
exit -1
fi

python3 $GS_HOME/tests/end2end-tests/check_infer.py --train-embout /data/gsgnn_lp_ml_wg_dot/emb/ --infer-embout /data/gsgnn_lp_ml_wg_dot/infer-emb/ --link-prediction

error_and_exit $?

rm -fr /data/gsgnn_lp_ml_wg_dot/
rm -fr /tmp/*
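The file-count checks earlier in this test encode the expected on-disk layout: with --num-trainers 4, each trainer process saves one shard of the WholeGraph-hosted sparse embedding per node type, so epoch-N/user/ and epoch-N/movie/ should each hold exactly four files. The same check restated in Python — hypothetical paths, same assertion:

# Hypothetical re-statement of the shard-count check (paths are examples):
import os

def check_sparse_emb_shards(epoch_dir: str, ntype: str, num_gpus: int = 4):
    shard_dir = os.path.join(epoch_dir, ntype)
    n_files = len(os.listdir(shard_dir))
    assert n_files == num_gpus, \
        f"{n_files} sparse emb shards in {shard_dir}, expected {num_gpus}"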
50 changes: 50 additions & 0 deletions tests/end2end-tests/graphstorm-nc/mgpu_test.sh
@@ -463,4 +463,54 @@ python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_s

error_and_exit $?


echo "**************dataset: MovieLens classification, RGCN layer: 1, node feat: fixed HF BERT, BERT nodes: movie, inference: mini-batch save model save emb node, wholegraph learnable emb"
python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --save-model-path /data/gsgnn_wg_nc_ml/ --topk-model-to-save 1 --save-embed-path /data/gsgnn_wg_nc_ml/emb/ --num-epochs 3 --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --backend nccl --use-node-embeddings true --use-wholegraph-sparse-emb True

error_and_exit $?

# check prints
cnt=$(grep "save_embed_path: /data/gsgnn_wg_nc_ml/emb/" /tmp/train_log.txt | wc -l)
if test $cnt -lt 1
then
echo "We use SageMaker task tracker, we should have save_embed_path"
exit -1
fi

cnt=$(grep "save_model_path: /data/gsgnn_wg_nc_ml/" /tmp/train_log.txt | wc -l)
if test $cnt -lt 1
then
echo "We use SageMaker task tracker, we should have save_model_path"
exit -1
fi

bst_cnt=$(grep "Best Test accuracy" /tmp/train_log.txt | wc -l)
if test $bst_cnt -lt 1
then
echo "We use SageMaker task tracker, we should have Best Test accuracy"
exit -1
fi

best_epoch=$(grep "successfully save the model to" /tmp/train_log.txt | tail -1 | tr -d '\n' | tail -c 1)
echo "The best model is saved in epoch $best_epoch"

rm /tmp/train_log.txt

echo "**************dataset: Movielens, do inference on saved model, wholegraph learnable emb"
python3 -m graphstorm.run.gs_node_classification --inference --workspace $GS_HOME/inference_scripts/np_infer/ --num-trainers $NUM_INFERs --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc_infer.yaml --use-mini-batch-infer false --save-embed-path /data/gsgnn_wg_nc_ml/infer-emb/ --restore-model-path /data/gsgnn_wg_nc_ml/epoch-$best_epoch/ --save-prediction-path /data/gsgnn_wg_nc_ml/prediction/ --logging-file /tmp/log.txt --preserve-input True --backend nccl --use-node-embeddings true --use-wholegraph-sparse-emb True

error_and_exit $?

bst_cnt=$(grep "Best Test accuracy" /tmp/log.txt | wc -l)
if test $bst_cnt -lt 1
then
echo "We use SageMaker task tracker, we should have Best Test accuracy"
exit -1
fi

python3 $GS_HOME/tests/end2end-tests/check_np_infer_emb.py --train-embout /data/gsgnn_wg_nc_ml/emb/ --infer-embout /data/gsgnn_wg_nc_ml/infer-emb/

error_and_exit $?

rm -fr /data/gsgnn_wg_nc_ml/
rm -fr /tmp/*
