diff --git a/.github/workflow_scripts/e2e_mgpu_check.sh b/.github/workflow_scripts/e2e_mgpu_check.sh index 48a90db8ff..66417f78a7 100644 --- a/.github/workflow_scripts/e2e_mgpu_check.sh +++ b/.github/workflow_scripts/e2e_mgpu_check.sh @@ -1,6 +1,8 @@ # Move to parent directory cd ../../ +pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com pylibwholegraph-cu11 + set -ex sh ./tests/end2end-tests/setup.sh diff --git a/python/graphstorm/model/embed.py b/python/graphstorm/model/embed.py index 3465a55362..fd12b49759 100644 --- a/python/graphstorm/model/embed.py +++ b/python/graphstorm/model/embed.py @@ -151,6 +151,17 @@ def in_dims(self): """ return None + @property + def use_wholegraph_sparse_emb(self): + """ Whether or not to use WholeGraph to host embeddings for sparse updates. + + Note: By default, a GSNodeInputLayer does not support WholeGraph + sparse embedding, unless implemented specifically. + + Note: GSNodeEncoderInputLayer supports WholeGraph sparse embedding. + """ + return False + class GSNodeEncoderInputLayer(GSNodeInputLayer): """The input encoder layer for all nodes in a heterogeneous graph. diff --git a/python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py b/python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py index 8a6bd6b98b..25f663781d 100644 --- a/python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py +++ b/python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py @@ -22,7 +22,7 @@ from graphstorm.inference import GSgnnEdgePredictionInferrer from graphstorm.eval import GSgnnAccEvaluator, GSgnnRegressionEvaluator from graphstorm.dataloading import GSgnnEdgeInferData, GSgnnEdgeDataLoader -from graphstorm.utils import setup_device, get_lm_ntypes +from graphstorm.utils import setup_device, get_lm_ntypes, use_wholegraph def get_evaluator(config): # pylint: disable=unused-argument """ Get evaluator class @@ -43,7 +43,9 @@ def main(config_args): config = GSConfig(config_args) config.verify_arguments(False) - gs.initialize(ip_config=config.ip_config, backend=config.backend) + use_wg_feats = use_wholegraph(config.part_config) + gs.initialize(ip_config=config.ip_config, backend=config.backend, + use_wholegraph=config.use_wholegraph_sparse_emb or use_wg_feats) device = setup_device(config.local_rank) infer_data = GSgnnEdgeInferData(config.graph_name, diff --git a/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py b/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py index 3683e2d284..073c529dbf 100644 --- a/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py +++ b/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py @@ -27,7 +27,11 @@ GSgnnLinkPredictionPredefinedTestDataLoader) from graphstorm.dataloading import BUILTIN_LP_UNIFORM_NEG_SAMPLER from graphstorm.dataloading import BUILTIN_LP_JOINT_NEG_SAMPLER -from graphstorm.utils import setup_device, get_lm_ntypes +from graphstorm.utils import ( + setup_device, + get_lm_ntypes, + use_wholegraph, +) def main(config_args): """ main function @@ -35,7 +39,9 @@ def main(config_args): config = GSConfig(config_args) config.verify_arguments(False) - gs.initialize(ip_config=config.ip_config, backend=config.backend) + use_wg_feats = use_wholegraph(config.part_config) + gs.initialize(ip_config=config.ip_config, backend=config.backend, + use_wholegraph=config.use_wholegraph_sparse_emb or use_wg_feats) device = setup_device(config.local_rank) infer_data = GSgnnEdgeInferData(config.graph_name, diff --git a/python/graphstorm/run/gsgnn_np/np_infer_gnn.py b/python/graphstorm/run/gsgnn_np/np_infer_gnn.py index 10a84e9108..4d84a2ebda 100644 --- a/python/graphstorm/run/gsgnn_np/np_infer_gnn.py +++ b/python/graphstorm/run/gsgnn_np/np_infer_gnn.py @@ -21,7 +21,7 @@ from graphstorm.inference import GSgnnNodePredictionInferrer from graphstorm.eval import GSgnnAccEvaluator, GSgnnRegressionEvaluator from graphstorm.dataloading import GSgnnNodeInferData, GSgnnNodeDataLoader -from graphstorm.utils import setup_device, get_lm_ntypes +from graphstorm.utils import setup_device, get_lm_ntypes, use_wholegraph def get_evaluator(config): # pylint: disable=unused-argument """ Get evaluator class @@ -42,7 +42,9 @@ def main(config_args): config = GSConfig(config_args) config.verify_arguments(False) - gs.initialize(ip_config=config.ip_config, backend=config.backend) + use_wg_feats = use_wholegraph(config.part_config) + gs.initialize(ip_config=config.ip_config, backend=config.backend, + use_wholegraph=config.use_wholegraph_sparse_emb or use_wg_feats) device = setup_device(config.local_rank) infer_data = GSgnnNodeInferData(config.graph_name, diff --git a/tests/end2end-tests/check_np_infer_emb.py b/tests/end2end-tests/check_np_infer_emb.py index 4563df490f..315597328f 100644 --- a/tests/end2end-tests/check_np_infer_emb.py +++ b/tests/end2end-tests/check_np_infer_emb.py @@ -106,10 +106,10 @@ # train nids [0, 1, 2, ...] train_emb = train_emb[th.argsort(train_nids)] for nid, inf_emb in zip(infer_nids, infer_emb): - assert_almost_equal(train_emb[nid].numpy(), inf_emb.numpy(), decimal=2) + assert_almost_equal(train_emb[nid].numpy(), inf_emb.numpy(), decimal=1) train_remap_embs = {} for nid, train_emb in zip (train_remaped_nids, train_remaped_emb): train_remap_embs[nid] = train_emb for nid, inf_emb in zip(infer_remaped_nids, infer_remaped_emb): - assert_almost_equal(train_remap_embs[nid], inf_emb, decimal=2) + assert_almost_equal(train_remap_embs[nid], inf_emb, decimal=1) diff --git a/tests/end2end-tests/graphstorm-ec/mgpu_test.sh b/tests/end2end-tests/graphstorm-ec/mgpu_test.sh index 219ce3d402..b8fde1219e 100644 --- a/tests/end2end-tests/graphstorm-ec/mgpu_test.sh +++ b/tests/end2end-tests/graphstorm-ec/mgpu_test.sh @@ -225,4 +225,57 @@ python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_s error_and_exit $? rm -fr /data/gsgnn_ec/* +echo "**************dataset: Generated multilabel MovieLens EC, RGCN layer: 1, node feat: generated feature, inference: full graph, exclude-training-targets: True, wholegraph learnable emb" +python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_multi_label_ec/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --exclude-training-targets True --use-node-embeddings true --multilabel true --num-classes 5 --use-mini-batch-infer false --topk-model-to-save 1 --save-embed-path /data/gsgnn_wg_ec/emb/ --save-model-path /data/gsgnn_wg_ec/ --save-model-frequency 1000 --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --backend nccl --use-wholegraph-sparse-emb True + +error_and_exit $? + +# check prints +cnt=$(grep "save_embed_path: /data/gsgnn_wg_ec/emb/" /tmp/train_log.txt | wc -l) +if test $cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have save_embed_path" + exit -1 +fi + +cnt=$(grep "save_model_path: /data/gsgnn_wg_ec/" /tmp/train_log.txt | wc -l) +if test $cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have save_model_path" + exit -1 +fi + +bst_cnt=$(grep "Best Test accuracy" /tmp/train_log.txt | wc -l) +if test $bst_cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have Best Test accuracy" + exit -1 +fi + + +best_epoch=$(grep "successfully save the model to" /tmp/train_log.txt | tail -1 | tr -d '\n' | tail -c 1) +echo "The best model is saved in epoch $best_epoch" + +rm /tmp/train_log.txt + +echo "**************dataset: Generated multilabel MovieLens EC, do inference on saved model, wholegraph learnable emb" +python3 -m graphstorm.run.gs_edge_classification --inference --workspace $GS_HOME/inference_scripts/ep_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_multi_label_ec/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec_infer.yaml --multilabel true --num-classes 5 --use-node-embeddings true --use-mini-batch-infer false --save-embed-path /data/gsgnn_wg_ec/infer-emb/ --restore-model-path /data/gsgnn_wg_ec/epoch-$best_epoch/ --save-prediction-path /data/gsgnn_wg_ec/prediction/ --logging-file /tmp/log.txt --logging-level debug --preserve-input True --backend nccl --use-wholegraph-sparse-emb True + +error_and_exit $? + +bst_cnt=$(grep "Best Test accuracy" /tmp/log.txt | wc -l) +if test $bst_cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have Best Test accuracy" + exit -1 +fi + +rm /tmp/log.txt + +cd $GS_HOME/tests/end2end-tests/ +python3 check_infer.py --train-embout /data/gsgnn_wg_ec/emb/ --infer-embout /data/gsgnn_wg_ec/infer-emb/ + +error_and_exit $? + +rm -fr /data/gsgnn_wg_ec/ rm -fr /tmp/* diff --git a/tests/end2end-tests/graphstorm-lp/mgpu_test.sh b/tests/end2end-tests/graphstorm-lp/mgpu_test.sh index e6531a8dfe..a5044844e2 100644 --- a/tests/end2end-tests/graphstorm-lp/mgpu_test.sh +++ b/tests/end2end-tests/graphstorm-lp/mgpu_test.sh @@ -549,4 +549,66 @@ error_and_exit $? rm -fr /data/gsgnn_lp_ml_hard_dot/* +# wholegraph sparse embedding +echo "**************dataset: Movielens, RGCN layer 2, node feat: fixed HF BERT & sparse embed, BERT nodes: movie, inference: full-graph, negative_sampler: joint, exclude_training_targets: true, save model, wholegraph learnable emb" +python3 -m graphstorm.run.gs_link_prediction --workspace $GS_HOME/training_scripts/gsgnn_lp --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --exclude-training-targets True --reverse-edge-types-map user,rating,rating-rev,movie --save-model-path /data/gsgnn_lp_ml_wg_dot/ --topk-model-to-save 1 --save-model-frequency 1000 --save-embed-path /data/gsgnn_lp_ml_wg_dot/emb/ --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --use-wholegraph-sparse-emb True --backend nccl + +error_and_exit $? + +# check prints +cnt=$(grep "save_embed_path: /data/gsgnn_lp_ml_wg_dot/emb/" /tmp/train_log.txt | wc -l) +if test $cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have save_embed_path" + exit -1 +fi + +cnt=$(grep "save_model_path: /data/gsgnn_lp_ml_wg_dot/" /tmp/train_log.txt | wc -l) +if test $cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have save_model_path" + exit -1 +fi + +bst_cnt=$(grep "Best Test mrr" /tmp/train_log.txt | wc -l) +if test $bst_cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have Best Test mrr" + exit -1 +fi + +best_epoch_dot=$(grep "successfully save the model to" /tmp/train_log.txt | tail -1 | tr -d '\n' | tail -c 1) +echo "The best model is saved in epoch $best_epoch_dot" + +cnt=$(ls /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/user/ | wc -l) +if test $cnt != 4 +then + echo "The number of sparse emb files $cnt is not equal to the number of gpus 4" + exit -1 +fi + +cnt=$(ls /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/movie/ | wc -l) +if test $cnt != 4 +then + echo "The number of sparse emb files $cnt is not equal to the number of gpus 4" + exit -1 +fi + +echo "**************dataset: Movielens, do inference on saved model, decoder: dot, wholegraph learnable emb" +python3 -m graphstorm.run.gs_link_prediction --inference --workspace $GS_HOME/inference_scripts/lp_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp_infer.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --save-embed-path /data/gsgnn_lp_ml_wg_dot/infer-emb/ --restore-model-path /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/ --logging-file /tmp/log.txt --preserve-input True --use-wholegraph-sparse-emb True --backend nccl + +error_and_exit $? + +bst_cnt=$(grep "Best Test mrr" /tmp/log.txt | wc -l) +if test $bst_cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have Best Test mrr" + exit -1 +fi + +python3 $GS_HOME/tests/end2end-tests/check_infer.py --train-embout /data/gsgnn_lp_ml_wg_dot/emb/ --infer-embout /data/gsgnn_lp_ml_wg_dot/infer-emb/ --link-prediction + +error_and_exit $? + +rm -fr /data/gsgnn_lp_ml_wg_dot/ rm -fr /tmp/* diff --git a/tests/end2end-tests/graphstorm-nc/mgpu_test.sh b/tests/end2end-tests/graphstorm-nc/mgpu_test.sh index 53e0e50cc8..2e141ae149 100644 --- a/tests/end2end-tests/graphstorm-nc/mgpu_test.sh +++ b/tests/end2end-tests/graphstorm-nc/mgpu_test.sh @@ -463,4 +463,54 @@ python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_s error_and_exit $? + +echo "**************dataset: MovieLens classification, RGCN layer: 1, node feat: fixed HF BERT, BERT nodes: movie, inference: mini-batch save model save emb node, wholegraph learnable emb" +python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --save-model-path /data/gsgnn_wg_nc_ml/ --topk-model-to-save 1 --save-embed-path /data/gsgnn_wg_nc_ml/emb/ --num-epochs 3 --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --backend nccl --use-node-embeddings true --use-wholegraph-sparse-emb True + +error_and_exit $? + +# check prints +cnt=$(grep "save_embed_path: /data/gsgnn_wg_nc_ml/emb/" /tmp/train_log.txt | wc -l) +if test $cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have save_embed_path" + exit -1 +fi + +cnt=$(grep "save_model_path: /data/gsgnn_wg_nc_ml/" /tmp/train_log.txt | wc -l) +if test $cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have save_model_path" + exit -1 +fi + +bst_cnt=$(grep "Best Test accuracy" /tmp/train_log.txt | wc -l) +if test $bst_cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have Best Test accuracy" + exit -1 +fi + +best_epoch=$(grep "successfully save the model to" /tmp/train_log.txt | tail -1 | tr -d '\n' | tail -c 1) +echo "The best model is saved in epoch $best_epoch" + +rm /tmp/train_log.txt + +echo "**************dataset: Movielens, do inference on saved model, wholegraph learnable emb" +python3 -m graphstorm.run.gs_node_classification --inference --workspace $GS_HOME/inference_scripts/np_infer/ --num-trainers $NUM_INFERs --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc_infer.yaml --use-mini-batch-infer false --save-embed-path /data/gsgnn_wg_nc_ml/infer-emb/ --restore-model-path /data/gsgnn_wg_nc_ml/epoch-$best_epoch/ --save-prediction-path /data/gsgnn_wg_nc_ml/prediction/ --logging-file /tmp/log.txt --preserve-input True --backend nccl --use-node-embeddings true --use-wholegraph-sparse-emb True + +error_and_exit $? + +bst_cnt=$(grep "Best Test accuracy" /tmp/log.txt | wc -l) +if test $bst_cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have Best Test accuracy" + exit -1 +fi + +python3 $GS_HOME/tests/end2end-tests/check_np_infer_emb.py --train-embout /data/gsgnn_wg_nc_ml/emb/ --infer-embout /data/gsgnn_wg_nc_ml/infer-emb/ + +error_and_exit $? + +rm -fr /data/gsgnn_wg_nc_ml/ rm -fr /tmp/*