From 2d7fa6e5c92ca03aed6baef1f90f41e47b48bfea Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Mon, 12 Feb 2024 11:48:13 -0800 Subject: [PATCH 1/9] Add end2end CI test for wholegraph --- .github/workflow_scripts/e2e_mgpu_check.sh | 2 + .../end2end-tests/graphstorm-lp/mgpu_test.sh | 85 +++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/.github/workflow_scripts/e2e_mgpu_check.sh b/.github/workflow_scripts/e2e_mgpu_check.sh index 48a90db8ff..f39f6c2bf0 100644 --- a/.github/workflow_scripts/e2e_mgpu_check.sh +++ b/.github/workflow_scripts/e2e_mgpu_check.sh @@ -1,6 +1,8 @@ # Move to parent directory cd ../../ +pip install install --extra-index-url https://pypi.nvidia.com pylibwholegraph-cu11 + set -ex sh ./tests/end2end-tests/setup.sh diff --git a/tests/end2end-tests/graphstorm-lp/mgpu_test.sh b/tests/end2end-tests/graphstorm-lp/mgpu_test.sh index dee7ea2038..f6ea9e7c38 100644 --- a/tests/end2end-tests/graphstorm-lp/mgpu_test.sh +++ b/tests/end2end-tests/graphstorm-lp/mgpu_test.sh @@ -26,6 +26,91 @@ error_and_exit () { df /dev/shm -h +# wholegraph sparse embedding +echo "**************dataset: Movielens, RGCN layer 2, node feat: fixed HF BERT & sparse embed, BERT nodes: movie, inference: full-graph, negative_sampler: joint, exclude_training_targets: true, save model" +python3 -m graphstorm.run.gs_link_prediction --workspace $GS_HOME/training_scripts/gsgnn_lp --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --exclude-training-targets True --reverse-edge-types-map user,rating,rating-rev,movie --save-model-path /data/gsgnn_lp_ml_wg_dot/ --topk-model-to-save 1 --save-model-frequency 1000 --save-embed-path /data/gsgnn_lp_ml_wg_dot/emb/ --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --use-wholegraph-sparse-emb True + +# check prints +cnt=$(grep "save_embed_path: /data/gsgnn_lp_ml_wg_dot/emb/" /tmp/train_log.txt | wc -l) +if test $cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have save_embed_path" + exit -1 +fi + +cnt=$(grep "save_model_path: /data/gsgnn_lp_ml_wg_dot/" /tmp/train_log.txt | wc -l) +if test $cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have save_model_path" + exit -1 +fi + +bst_cnt=$(grep "Best Test mrr" /tmp/train_log.txt | wc -l) +if test $bst_cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have Best Test mrr" + exit -1 +fi + +cnt=$(grep "Best Iteration" /tmp/train_log.txt | wc -l) +if test $cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have Best Iteration" + exit -1 +fi + +cnt=$(ls -l /data/gsgnn_lp_ml_wg_dot/ | grep epoch | wc -l) +if test $cnt != 1 +then + echo "The number of save models $cnt is not equal to the specified topk 1" + exit -1 +fi + +best_epoch_dot=$(grep "successfully save the model to" /tmp/train_log.txt | tail -1 | tr -d '\n' | tail -c 1) +echo "The best model is saved in epoch $best_epoch_dot" + +cnt=$(ls /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/user/ | wc -l) +if test $cnt != 4 +then + echo "The number of sparse emb files $cnt is not equal to the number of gpus 4" + exit -1 +fi + +cnt=$(ls /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/movie/ | wc -l) +if test $cnt != 4 +then + echo "The number of sparse emb files $cnt is not equal to the number of gpus 4" + exit -1 +fi + +echo "**************dataset: Movielens, do inference on saved model, decoder: dot" +python3 -m graphstorm.run.gs_link_prediction --inference --workspace $GS_HOME/inference_scripts/lp_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp_infer.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --save-embed-path /data/gsgnn_lp_ml_wg_dot/infer-emb/ --restore-model-path /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/ --logging-file /tmp/log.txt --preserve-input True --use-wholegraph-sparse-emb True + +error_and_exit $? + +bst_cnt=$(grep "Best Test mrr" /tmp/log.txt | wc -l) +if test $bst_cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have Best Test mrr" + exit -1 +fi + +rm /tmp/log.txt + +python3 $GS_HOME/tests/end2end-tests/check_infer.py --train-embout /data/gsgnn_lp_ml_wg_dot/emb/ --infer-embout /data/gsgnn_lp_ml_wg_dot/infer-emb/ --link-prediction + +error_and_exit $? + +cnt=$(ls /data/gsgnn_lp_ml_wg_dot/infer-emb/ | grep rel_emb.pt | wc -l) +if test $cnt -ne 0 +then + echo "Dot product inference does not output edge embedding" + exit -1 +fi +rm -fr /data/gsgnn_lp_ml_wg_dot/infer-emb/ +rm /tmp/log.txt +rm /tmp/train_log.txt + echo "**************dataset: Movielens, RGCN layer 2, node feat: fixed HF BERT & sparse embed, BERT nodes: movie, inference: full-graph, negative_sampler: joint, exclude_training_targets: true, save model" python3 -m graphstorm.run.gs_link_prediction --workspace $GS_HOME/training_scripts/gsgnn_lp --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --exclude-training-targets True --reverse-edge-types-map user,rating,rating-rev,movie --save-model-path /data/gsgnn_lp_ml_dot/ --topk-model-to-save 1 --save-model-frequency 1000 --save-embed-path /data/gsgnn_lp_ml_dot/emb/ --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True From 88117b33041f595475ac09798a4b30de4e44be8b Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Mon, 12 Feb 2024 13:22:41 -0800 Subject: [PATCH 2/9] Update --- .github/workflow_scripts/e2e_mgpu_check.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflow_scripts/e2e_mgpu_check.sh b/.github/workflow_scripts/e2e_mgpu_check.sh index f39f6c2bf0..66417f78a7 100644 --- a/.github/workflow_scripts/e2e_mgpu_check.sh +++ b/.github/workflow_scripts/e2e_mgpu_check.sh @@ -1,7 +1,7 @@ # Move to parent directory cd ../../ -pip install install --extra-index-url https://pypi.nvidia.com pylibwholegraph-cu11 +pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com pylibwholegraph-cu11 set -ex From fca52fefc27108e60cd6da762511eb8d71125291 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Mon, 12 Feb 2024 22:38:36 -0800 Subject: [PATCH 3/9] Enable nccl --- tests/end2end-tests/graphstorm-lp/mgpu_test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/end2end-tests/graphstorm-lp/mgpu_test.sh b/tests/end2end-tests/graphstorm-lp/mgpu_test.sh index f6ea9e7c38..99abdebcb9 100644 --- a/tests/end2end-tests/graphstorm-lp/mgpu_test.sh +++ b/tests/end2end-tests/graphstorm-lp/mgpu_test.sh @@ -28,7 +28,7 @@ df /dev/shm -h # wholegraph sparse embedding echo "**************dataset: Movielens, RGCN layer 2, node feat: fixed HF BERT & sparse embed, BERT nodes: movie, inference: full-graph, negative_sampler: joint, exclude_training_targets: true, save model" -python3 -m graphstorm.run.gs_link_prediction --workspace $GS_HOME/training_scripts/gsgnn_lp --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --exclude-training-targets True --reverse-edge-types-map user,rating,rating-rev,movie --save-model-path /data/gsgnn_lp_ml_wg_dot/ --topk-model-to-save 1 --save-model-frequency 1000 --save-embed-path /data/gsgnn_lp_ml_wg_dot/emb/ --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --use-wholegraph-sparse-emb True +python3 -m graphstorm.run.gs_link_prediction --workspace $GS_HOME/training_scripts/gsgnn_lp --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --exclude-training-targets True --reverse-edge-types-map user,rating,rating-rev,movie --save-model-path /data/gsgnn_lp_ml_wg_dot/ --topk-model-to-save 1 --save-model-frequency 1000 --save-embed-path /data/gsgnn_lp_ml_wg_dot/emb/ --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --use-wholegraph-sparse-emb True --backend nccl # check prints cnt=$(grep "save_embed_path: /data/gsgnn_lp_ml_wg_dot/emb/" /tmp/train_log.txt | wc -l) @@ -84,7 +84,7 @@ then fi echo "**************dataset: Movielens, do inference on saved model, decoder: dot" -python3 -m graphstorm.run.gs_link_prediction --inference --workspace $GS_HOME/inference_scripts/lp_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp_infer.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --save-embed-path /data/gsgnn_lp_ml_wg_dot/infer-emb/ --restore-model-path /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/ --logging-file /tmp/log.txt --preserve-input True --use-wholegraph-sparse-emb True +python3 -m graphstorm.run.gs_link_prediction --inference --workspace $GS_HOME/inference_scripts/lp_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp_infer.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --save-embed-path /data/gsgnn_lp_ml_wg_dot/infer-emb/ --restore-model-path /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/ --logging-file /tmp/log.txt --preserve-input True --use-wholegraph-sparse-emb True --backend nccl error_and_exit $? From 9c74756e3b2849d76845b26f3fa32bd4fd372527 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 14 Feb 2024 16:19:25 -0800 Subject: [PATCH 4/9] Enable wholegraph sparse embedding for inference --- python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py | 6 ++++-- python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py | 10 ++++++++-- python/graphstorm/run/gsgnn_np/np_infer_gnn.py | 6 ++++-- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py b/python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py index 8a6bd6b98b..25f663781d 100644 --- a/python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py +++ b/python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py @@ -22,7 +22,7 @@ from graphstorm.inference import GSgnnEdgePredictionInferrer from graphstorm.eval import GSgnnAccEvaluator, GSgnnRegressionEvaluator from graphstorm.dataloading import GSgnnEdgeInferData, GSgnnEdgeDataLoader -from graphstorm.utils import setup_device, get_lm_ntypes +from graphstorm.utils import setup_device, get_lm_ntypes, use_wholegraph def get_evaluator(config): # pylint: disable=unused-argument """ Get evaluator class @@ -43,7 +43,9 @@ def main(config_args): config = GSConfig(config_args) config.verify_arguments(False) - gs.initialize(ip_config=config.ip_config, backend=config.backend) + use_wg_feats = use_wholegraph(config.part_config) + gs.initialize(ip_config=config.ip_config, backend=config.backend, + use_wholegraph=config.use_wholegraph_sparse_emb or use_wg_feats) device = setup_device(config.local_rank) infer_data = GSgnnEdgeInferData(config.graph_name, diff --git a/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py b/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py index 3683e2d284..073c529dbf 100644 --- a/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py +++ b/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py @@ -27,7 +27,11 @@ GSgnnLinkPredictionPredefinedTestDataLoader) from graphstorm.dataloading import BUILTIN_LP_UNIFORM_NEG_SAMPLER from graphstorm.dataloading import BUILTIN_LP_JOINT_NEG_SAMPLER -from graphstorm.utils import setup_device, get_lm_ntypes +from graphstorm.utils import ( + setup_device, + get_lm_ntypes, + use_wholegraph, +) def main(config_args): """ main function @@ -35,7 +39,9 @@ def main(config_args): config = GSConfig(config_args) config.verify_arguments(False) - gs.initialize(ip_config=config.ip_config, backend=config.backend) + use_wg_feats = use_wholegraph(config.part_config) + gs.initialize(ip_config=config.ip_config, backend=config.backend, + use_wholegraph=config.use_wholegraph_sparse_emb or use_wg_feats) device = setup_device(config.local_rank) infer_data = GSgnnEdgeInferData(config.graph_name, diff --git a/python/graphstorm/run/gsgnn_np/np_infer_gnn.py b/python/graphstorm/run/gsgnn_np/np_infer_gnn.py index 10a84e9108..4d84a2ebda 100644 --- a/python/graphstorm/run/gsgnn_np/np_infer_gnn.py +++ b/python/graphstorm/run/gsgnn_np/np_infer_gnn.py @@ -21,7 +21,7 @@ from graphstorm.inference import GSgnnNodePredictionInferrer from graphstorm.eval import GSgnnAccEvaluator, GSgnnRegressionEvaluator from graphstorm.dataloading import GSgnnNodeInferData, GSgnnNodeDataLoader -from graphstorm.utils import setup_device, get_lm_ntypes +from graphstorm.utils import setup_device, get_lm_ntypes, use_wholegraph def get_evaluator(config): # pylint: disable=unused-argument """ Get evaluator class @@ -42,7 +42,9 @@ def main(config_args): config = GSConfig(config_args) config.verify_arguments(False) - gs.initialize(ip_config=config.ip_config, backend=config.backend) + use_wg_feats = use_wholegraph(config.part_config) + gs.initialize(ip_config=config.ip_config, backend=config.backend, + use_wholegraph=config.use_wholegraph_sparse_emb or use_wg_feats) device = setup_device(config.local_rank) infer_data = GSgnnNodeInferData(config.graph_name, From b8e6822a8fd1bcc6bc29602bc13f14afa6c5b35f Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 14 Feb 2024 17:41:15 -0800 Subject: [PATCH 5/9] Update --- python/graphstorm/model/embed.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/graphstorm/model/embed.py b/python/graphstorm/model/embed.py index 3465a55362..fd12b49759 100644 --- a/python/graphstorm/model/embed.py +++ b/python/graphstorm/model/embed.py @@ -151,6 +151,17 @@ def in_dims(self): """ return None + @property + def use_wholegraph_sparse_emb(self): + """ Whether or not to use WholeGraph to host embeddings for sparse updates. + + Note: By default, a GSNodeInputLayer does not support WholeGraph + sparse embedding, unless implemented specifically. + + Note: GSNodeEncoderInputLayer supports WholeGraph sparse embedding. + """ + return False + class GSNodeEncoderInputLayer(GSNodeInputLayer): """The input encoder layer for all nodes in a heterogeneous graph. From 00723194e206b45db886cf3dda456b2e6757a015 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 14 Feb 2024 19:53:51 -0800 Subject: [PATCH 6/9] Add end2end tests --- .../end2end-tests/graphstorm-ec/mgpu_test.sh | 53 +++++++ .../end2end-tests/graphstorm-lp/mgpu_test.sh | 147 ++++++++---------- .../end2end-tests/graphstorm-nc/mgpu_test.sh | 50 ++++++ 3 files changed, 165 insertions(+), 85 deletions(-) diff --git a/tests/end2end-tests/graphstorm-ec/mgpu_test.sh b/tests/end2end-tests/graphstorm-ec/mgpu_test.sh index 219ce3d402..b8fde1219e 100644 --- a/tests/end2end-tests/graphstorm-ec/mgpu_test.sh +++ b/tests/end2end-tests/graphstorm-ec/mgpu_test.sh @@ -225,4 +225,57 @@ python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_s error_and_exit $? rm -fr /data/gsgnn_ec/* +echo "**************dataset: Generated multilabel MovieLens EC, RGCN layer: 1, node feat: generated feature, inference: full graph, exclude-training-targets: True, wholegraph learnable emb" +python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_multi_label_ec/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --exclude-training-targets True --use-node-embeddings true --multilabel true --num-classes 5 --use-mini-batch-infer false --topk-model-to-save 1 --save-embed-path /data/gsgnn_wg_ec/emb/ --save-model-path /data/gsgnn_wg_ec/ --save-model-frequency 1000 --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --backend nccl --use-wholegraph-sparse-emb True + +error_and_exit $? + +# check prints +cnt=$(grep "save_embed_path: /data/gsgnn_wg_ec/emb/" /tmp/train_log.txt | wc -l) +if test $cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have save_embed_path" + exit -1 +fi + +cnt=$(grep "save_model_path: /data/gsgnn_wg_ec/" /tmp/train_log.txt | wc -l) +if test $cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have save_model_path" + exit -1 +fi + +bst_cnt=$(grep "Best Test accuracy" /tmp/train_log.txt | wc -l) +if test $bst_cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have Best Test accuracy" + exit -1 +fi + + +best_epoch=$(grep "successfully save the model to" /tmp/train_log.txt | tail -1 | tr -d '\n' | tail -c 1) +echo "The best model is saved in epoch $best_epoch" + +rm /tmp/train_log.txt + +echo "**************dataset: Generated multilabel MovieLens EC, do inference on saved model, wholegraph learnable emb" +python3 -m graphstorm.run.gs_edge_classification --inference --workspace $GS_HOME/inference_scripts/ep_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_multi_label_ec/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec_infer.yaml --multilabel true --num-classes 5 --use-node-embeddings true --use-mini-batch-infer false --save-embed-path /data/gsgnn_wg_ec/infer-emb/ --restore-model-path /data/gsgnn_wg_ec/epoch-$best_epoch/ --save-prediction-path /data/gsgnn_wg_ec/prediction/ --logging-file /tmp/log.txt --logging-level debug --preserve-input True --backend nccl --use-wholegraph-sparse-emb True + +error_and_exit $? + +bst_cnt=$(grep "Best Test accuracy" /tmp/log.txt | wc -l) +if test $bst_cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have Best Test accuracy" + exit -1 +fi + +rm /tmp/log.txt + +cd $GS_HOME/tests/end2end-tests/ +python3 check_infer.py --train-embout /data/gsgnn_wg_ec/emb/ --infer-embout /data/gsgnn_wg_ec/infer-emb/ + +error_and_exit $? + +rm -fr /data/gsgnn_wg_ec/ rm -fr /tmp/* diff --git a/tests/end2end-tests/graphstorm-lp/mgpu_test.sh b/tests/end2end-tests/graphstorm-lp/mgpu_test.sh index 299ef88de3..a5044844e2 100644 --- a/tests/end2end-tests/graphstorm-lp/mgpu_test.sh +++ b/tests/end2end-tests/graphstorm-lp/mgpu_test.sh @@ -26,91 +26,6 @@ error_and_exit () { df /dev/shm -h -# wholegraph sparse embedding -echo "**************dataset: Movielens, RGCN layer 2, node feat: fixed HF BERT & sparse embed, BERT nodes: movie, inference: full-graph, negative_sampler: joint, exclude_training_targets: true, save model" -python3 -m graphstorm.run.gs_link_prediction --workspace $GS_HOME/training_scripts/gsgnn_lp --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --exclude-training-targets True --reverse-edge-types-map user,rating,rating-rev,movie --save-model-path /data/gsgnn_lp_ml_wg_dot/ --topk-model-to-save 1 --save-model-frequency 1000 --save-embed-path /data/gsgnn_lp_ml_wg_dot/emb/ --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --use-wholegraph-sparse-emb True --backend nccl - -# check prints -cnt=$(grep "save_embed_path: /data/gsgnn_lp_ml_wg_dot/emb/" /tmp/train_log.txt | wc -l) -if test $cnt -lt 1 -then - echo "We use SageMaker task tracker, we should have save_embed_path" - exit -1 -fi - -cnt=$(grep "save_model_path: /data/gsgnn_lp_ml_wg_dot/" /tmp/train_log.txt | wc -l) -if test $cnt -lt 1 -then - echo "We use SageMaker task tracker, we should have save_model_path" - exit -1 -fi - -bst_cnt=$(grep "Best Test mrr" /tmp/train_log.txt | wc -l) -if test $bst_cnt -lt 1 -then - echo "We use SageMaker task tracker, we should have Best Test mrr" - exit -1 -fi - -cnt=$(grep "Best Iteration" /tmp/train_log.txt | wc -l) -if test $cnt -lt 1 -then - echo "We use SageMaker task tracker, we should have Best Iteration" - exit -1 -fi - -cnt=$(ls -l /data/gsgnn_lp_ml_wg_dot/ | grep epoch | wc -l) -if test $cnt != 1 -then - echo "The number of save models $cnt is not equal to the specified topk 1" - exit -1 -fi - -best_epoch_dot=$(grep "successfully save the model to" /tmp/train_log.txt | tail -1 | tr -d '\n' | tail -c 1) -echo "The best model is saved in epoch $best_epoch_dot" - -cnt=$(ls /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/user/ | wc -l) -if test $cnt != 4 -then - echo "The number of sparse emb files $cnt is not equal to the number of gpus 4" - exit -1 -fi - -cnt=$(ls /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/movie/ | wc -l) -if test $cnt != 4 -then - echo "The number of sparse emb files $cnt is not equal to the number of gpus 4" - exit -1 -fi - -echo "**************dataset: Movielens, do inference on saved model, decoder: dot" -python3 -m graphstorm.run.gs_link_prediction --inference --workspace $GS_HOME/inference_scripts/lp_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp_infer.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --save-embed-path /data/gsgnn_lp_ml_wg_dot/infer-emb/ --restore-model-path /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/ --logging-file /tmp/log.txt --preserve-input True --use-wholegraph-sparse-emb True --backend nccl - -error_and_exit $? - -bst_cnt=$(grep "Best Test mrr" /tmp/log.txt | wc -l) -if test $bst_cnt -lt 1 -then - echo "We use SageMaker task tracker, we should have Best Test mrr" - exit -1 -fi - -rm /tmp/log.txt - -python3 $GS_HOME/tests/end2end-tests/check_infer.py --train-embout /data/gsgnn_lp_ml_wg_dot/emb/ --infer-embout /data/gsgnn_lp_ml_wg_dot/infer-emb/ --link-prediction - -error_and_exit $? - -cnt=$(ls /data/gsgnn_lp_ml_wg_dot/infer-emb/ | grep rel_emb.pt | wc -l) -if test $cnt -ne 0 -then - echo "Dot product inference does not output edge embedding" - exit -1 -fi -rm -fr /data/gsgnn_lp_ml_wg_dot/infer-emb/ -rm /tmp/log.txt -rm /tmp/train_log.txt - echo "**************dataset: Movielens, RGCN layer 2, node feat: fixed HF BERT & sparse embed, BERT nodes: movie, inference: full-graph, negative_sampler: joint, exclude_training_targets: true, save model" python3 -m graphstorm.run.gs_link_prediction --workspace $GS_HOME/training_scripts/gsgnn_lp --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --exclude-training-targets True --reverse-edge-types-map user,rating,rating-rev,movie --save-model-path /data/gsgnn_lp_ml_dot/ --topk-model-to-save 1 --save-model-frequency 1000 --save-embed-path /data/gsgnn_lp_ml_dot/emb/ --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True @@ -634,4 +549,66 @@ error_and_exit $? rm -fr /data/gsgnn_lp_ml_hard_dot/* +# wholegraph sparse embedding +echo "**************dataset: Movielens, RGCN layer 2, node feat: fixed HF BERT & sparse embed, BERT nodes: movie, inference: full-graph, negative_sampler: joint, exclude_training_targets: true, save model, wholegraph learnable emb" +python3 -m graphstorm.run.gs_link_prediction --workspace $GS_HOME/training_scripts/gsgnn_lp --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --exclude-training-targets True --reverse-edge-types-map user,rating,rating-rev,movie --save-model-path /data/gsgnn_lp_ml_wg_dot/ --topk-model-to-save 1 --save-model-frequency 1000 --save-embed-path /data/gsgnn_lp_ml_wg_dot/emb/ --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --use-wholegraph-sparse-emb True --backend nccl + +error_and_exit $? + +# check prints +cnt=$(grep "save_embed_path: /data/gsgnn_lp_ml_wg_dot/emb/" /tmp/train_log.txt | wc -l) +if test $cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have save_embed_path" + exit -1 +fi + +cnt=$(grep "save_model_path: /data/gsgnn_lp_ml_wg_dot/" /tmp/train_log.txt | wc -l) +if test $cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have save_model_path" + exit -1 +fi + +bst_cnt=$(grep "Best Test mrr" /tmp/train_log.txt | wc -l) +if test $bst_cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have Best Test mrr" + exit -1 +fi + +best_epoch_dot=$(grep "successfully save the model to" /tmp/train_log.txt | tail -1 | tr -d '\n' | tail -c 1) +echo "The best model is saved in epoch $best_epoch_dot" + +cnt=$(ls /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/user/ | wc -l) +if test $cnt != 4 +then + echo "The number of sparse emb files $cnt is not equal to the number of gpus 4" + exit -1 +fi + +cnt=$(ls /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/movie/ | wc -l) +if test $cnt != 4 +then + echo "The number of sparse emb files $cnt is not equal to the number of gpus 4" + exit -1 +fi + +echo "**************dataset: Movielens, do inference on saved model, decoder: dot, wholegraph learnable emb" +python3 -m graphstorm.run.gs_link_prediction --inference --workspace $GS_HOME/inference_scripts/lp_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp_infer.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --save-embed-path /data/gsgnn_lp_ml_wg_dot/infer-emb/ --restore-model-path /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/ --logging-file /tmp/log.txt --preserve-input True --use-wholegraph-sparse-emb True --backend nccl + +error_and_exit $? + +bst_cnt=$(grep "Best Test mrr" /tmp/log.txt | wc -l) +if test $bst_cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have Best Test mrr" + exit -1 +fi + +python3 $GS_HOME/tests/end2end-tests/check_infer.py --train-embout /data/gsgnn_lp_ml_wg_dot/emb/ --infer-embout /data/gsgnn_lp_ml_wg_dot/infer-emb/ --link-prediction + +error_and_exit $? + +rm -fr /data/gsgnn_lp_ml_wg_dot/ rm -fr /tmp/* diff --git a/tests/end2end-tests/graphstorm-nc/mgpu_test.sh b/tests/end2end-tests/graphstorm-nc/mgpu_test.sh index 53e0e50cc8..78396191ed 100644 --- a/tests/end2end-tests/graphstorm-nc/mgpu_test.sh +++ b/tests/end2end-tests/graphstorm-nc/mgpu_test.sh @@ -463,4 +463,54 @@ python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_s error_and_exit $? + +echo "**************dataset: MovieLens classification, RGCN layer: 1, node feat: fixed HF BERT, BERT nodes: movie, inference: mini-batch save model save emb node, wholegraph learnable emb" +python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --save-model-path /data/gsgnn_wg_nc_ml/ --topk-model-to-save 1 --save-embed-path /data/gsgnn_wg_nc_ml/emb/ --num-epochs 3 --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --backend nccl --use-node-embeddings true + +error_and_exit $? + +# check prints +cnt=$(grep "save_embed_path: /data/gsgnn_wg_nc_ml/emb/" /tmp/train_log.txt | wc -l) +if test $cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have save_embed_path" + exit -1 +fi + +cnt=$(grep "save_model_path: /data/gsgnn_wg_nc_ml/" /tmp/train_log.txt | wc -l) +if test $cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have save_model_path" + exit -1 +fi + +bst_cnt=$(grep "Best Test accuracy" /tmp/train_log.txt | wc -l) +if test $bst_cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have Best Test accuracy" + exit -1 +fi + +best_epoch=$(grep "successfully save the model to" /tmp/train_log.txt | tail -1 | tr -d '\n' | tail -c 1) +echo "The best model is saved in epoch $best_epoch" + +rm /tmp/train_log.txt + +echo "**************dataset: Movielens, do inference on saved model, wholegraph learnable emb" +python3 -m graphstorm.run.gs_node_classification --inference --workspace $GS_HOME/inference_scripts/np_infer/ --num-trainers $NUM_INFERs --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc_infer.yaml --use-mini-batch-infer false --save-embed-path /data/gsgnn_wg_nc_ml/infer-emb/ --restore-model-path /data/gsgnn_wg_nc_ml/epoch-$best_epoch/ --save-prediction-path /data/gsgnn_wg_nc_ml/prediction/ --logging-file /tmp/log.txt --preserve-input True --backend nccl --use-node-embeddings true + +error_and_exit $? + +bst_cnt=$(grep "Best Test accuracy" /tmp/log.txt | wc -l) +if test $bst_cnt -lt 1 +then + echo "We use SageMaker task tracker, we should have Best Test accuracy" + exit -1 +fi + +python3 $GS_HOME/tests/end2end-tests/check_np_infer_emb.py --train-embout /data/gsgnn_wg_nc_ml/emb/ --infer-embout /data/gsgnn_wg_nc_ml/infer-emb/ + +error_and_exit $? + +rm -fr /data/gsgnn_wg_nc_ml/ rm -fr /tmp/* From c81d2479a8a8ed4db9777d787ecf32569459b994 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Thu, 15 Feb 2024 11:21:16 -0800 Subject: [PATCH 7/9] Update --- tests/end2end-tests/graphstorm-nc/mgpu_test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/end2end-tests/graphstorm-nc/mgpu_test.sh b/tests/end2end-tests/graphstorm-nc/mgpu_test.sh index 78396191ed..2e141ae149 100644 --- a/tests/end2end-tests/graphstorm-nc/mgpu_test.sh +++ b/tests/end2end-tests/graphstorm-nc/mgpu_test.sh @@ -465,7 +465,7 @@ error_and_exit $? echo "**************dataset: MovieLens classification, RGCN layer: 1, node feat: fixed HF BERT, BERT nodes: movie, inference: mini-batch save model save emb node, wholegraph learnable emb" -python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --save-model-path /data/gsgnn_wg_nc_ml/ --topk-model-to-save 1 --save-embed-path /data/gsgnn_wg_nc_ml/emb/ --num-epochs 3 --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --backend nccl --use-node-embeddings true +python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --save-model-path /data/gsgnn_wg_nc_ml/ --topk-model-to-save 1 --save-embed-path /data/gsgnn_wg_nc_ml/emb/ --num-epochs 3 --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --backend nccl --use-node-embeddings true --use-wholegraph-sparse-emb True error_and_exit $? @@ -497,7 +497,7 @@ echo "The best model is saved in epoch $best_epoch" rm /tmp/train_log.txt echo "**************dataset: Movielens, do inference on saved model, wholegraph learnable emb" -python3 -m graphstorm.run.gs_node_classification --inference --workspace $GS_HOME/inference_scripts/np_infer/ --num-trainers $NUM_INFERs --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc_infer.yaml --use-mini-batch-infer false --save-embed-path /data/gsgnn_wg_nc_ml/infer-emb/ --restore-model-path /data/gsgnn_wg_nc_ml/epoch-$best_epoch/ --save-prediction-path /data/gsgnn_wg_nc_ml/prediction/ --logging-file /tmp/log.txt --preserve-input True --backend nccl --use-node-embeddings true +python3 -m graphstorm.run.gs_node_classification --inference --workspace $GS_HOME/inference_scripts/np_infer/ --num-trainers $NUM_INFERs --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc_infer.yaml --use-mini-batch-infer false --save-embed-path /data/gsgnn_wg_nc_ml/infer-emb/ --restore-model-path /data/gsgnn_wg_nc_ml/epoch-$best_epoch/ --save-prediction-path /data/gsgnn_wg_nc_ml/prediction/ --logging-file /tmp/log.txt --preserve-input True --backend nccl --use-node-embeddings true --use-wholegraph-sparse-emb True error_and_exit $? From 1457f22bbc450901dc24267a96b187b704e688f4 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Thu, 15 Feb 2024 15:25:58 -0800 Subject: [PATCH 8/9] Update --- tests/end2end-tests/check_np_infer_emb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/end2end-tests/check_np_infer_emb.py b/tests/end2end-tests/check_np_infer_emb.py index 4563df490f..f6c1d21c96 100644 --- a/tests/end2end-tests/check_np_infer_emb.py +++ b/tests/end2end-tests/check_np_infer_emb.py @@ -106,7 +106,7 @@ # train nids [0, 1, 2, ...] train_emb = train_emb[th.argsort(train_nids)] for nid, inf_emb in zip(infer_nids, infer_emb): - assert_almost_equal(train_emb[nid].numpy(), inf_emb.numpy(), decimal=2) + assert_almost_equal(train_emb[nid].numpy(), inf_emb.numpy(), decimal=1) train_remap_embs = {} for nid, train_emb in zip (train_remaped_nids, train_remaped_emb): From 6f6fdedc9aea7895acbbc487a219eef05605adea Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Thu, 15 Feb 2024 16:29:47 -0800 Subject: [PATCH 9/9] Update --- tests/end2end-tests/check_np_infer_emb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/end2end-tests/check_np_infer_emb.py b/tests/end2end-tests/check_np_infer_emb.py index f6c1d21c96..315597328f 100644 --- a/tests/end2end-tests/check_np_infer_emb.py +++ b/tests/end2end-tests/check_np_infer_emb.py @@ -112,4 +112,4 @@ for nid, train_emb in zip (train_remaped_nids, train_remaped_emb): train_remap_embs[nid] = train_emb for nid, inf_emb in zip(infer_remaped_nids, infer_remaped_emb): - assert_almost_equal(train_remap_embs[nid], inf_emb, decimal=2) + assert_almost_equal(train_remap_embs[nid], inf_emb, decimal=1)