Add end2end CI test for wholegraph (#732)
*Issue #, if available:*
There is no end-to-end test for WholeGraph sparse embedding.

*Description of changes:*
Add end-to-end test scripts covering WholeGraph sparse embedding in the multi-GPU CI (edge classification, link prediction, and node classification).


By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution, under the terms of your
choice.

---------

Co-authored-by: Xiang Song <[email protected]>
classicsong and Xiang Song authored Feb 16, 2024
1 parent b070910 commit b4c3511
Showing 9 changed files with 196 additions and 8 deletions.
2 changes: 2 additions & 0 deletions .github/workflow_scripts/e2e_mgpu_check.sh
@@ -1,6 +1,8 @@
# Move to parent directory
cd ../../

pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com pylibwholegraph-cu11

set -ex

sh ./tests/end2end-tests/setup.sh
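For context: pylibwholegraph-cu11 is NVIDIA's WholeGraph wheel for CUDA 11, pulled from the pypi.nvidia.com index. A quick way to confirm the install worked — a sketch, not part of this PR, assuming the wheel exposes the pylibwholegraph.torch PyTorch binding:

# Sanity check (not part of this PR): confirm the WholeGraph wheel installed
# correctly before the e2e tests run. Assumes pylibwholegraph.torch is the
# PyTorch binding shipped by the pylibwholegraph-cu11 wheel.
try:
    import pylibwholegraph.torch  # noqa: F401
    print("pylibwholegraph is importable")
except ImportError as err:
    raise SystemExit(f"pylibwholegraph missing or broken: {err}")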
11 changes: 11 additions & 0 deletions python/graphstorm/model/embed.py
@@ -151,6 +151,17 @@ def in_dims(self):
"""
return None

@property
def use_wholegraph_sparse_emb(self):
""" Whether or not to use WholeGraph to host embeddings for sparse updates.
Note: By default, a GSNodeInputLayer does not support WholeGraph
sparse embedding, unless implemented specifically.
Note: GSNodeEncoderInputLayer supports WholeGraph sparse embedding.
"""
return False


class GSNodeEncoderInputLayer(GSNodeInputLayer):
"""The input encoder layer for all nodes in a heterogeneous graph.
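To illustrate the contract this property establishes — a hypothetical sketch, not code from this diff: a custom input layer that hosts its learnable embeddings in WholeGraph would override the property to return True, so callers can branch on it without knowing the concrete class.

# Illustration only (not from this diff); the class body is assumed minimal.
from graphstorm.model.embed import GSNodeInputLayer

class MyWGSparseInputLayer(GSNodeInputLayer):
    """ Hypothetical input layer whose learnable embeddings live in WholeGraph. """

    @property
    def use_wholegraph_sparse_emb(self):
        # Overrides the base-class default of False.
        return True

# Callers can then branch generically:
# if input_layer.use_wholegraph_sparse_emb:
#     ... take the WholeGraph-aware save/load path ...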
6 changes: 4 additions & 2 deletions python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py
@@ -22,7 +22,7 @@
from graphstorm.inference import GSgnnEdgePredictionInferrer
from graphstorm.eval import GSgnnAccEvaluator, GSgnnRegressionEvaluator
from graphstorm.dataloading import GSgnnEdgeInferData, GSgnnEdgeDataLoader
-from graphstorm.utils import setup_device, get_lm_ntypes
+from graphstorm.utils import setup_device, get_lm_ntypes, use_wholegraph

def get_evaluator(config): # pylint: disable=unused-argument
    """ Get evaluator class
@@ -43,7 +43,9 @@ def main(config_args):
    config = GSConfig(config_args)
    config.verify_arguments(False)

-    gs.initialize(ip_config=config.ip_config, backend=config.backend)
+    use_wg_feats = use_wholegraph(config.part_config)
+    gs.initialize(ip_config=config.ip_config, backend=config.backend,
+                  use_wholegraph=config.use_wholegraph_sparse_emb or use_wg_feats)
    device = setup_device(config.local_rank)

    infer_data = GSgnnEdgeInferData(config.graph_name,
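The same two-line change is applied to all three inference entry points (edge prediction here, link prediction and node prediction below): WholeGraph must be initialized when either the partitioned graph stores its node features in WholeGraph format (use_wg_feats) or the model hosts its learnable sparse embeddings there. As a rough sketch of what the use_wholegraph(part_config) helper plausibly checks — an assumption for illustration, not this PR's code:

# Hedged sketch of a WholeGraph-feature check; the real helper is
# graphstorm.utils.use_wholegraph and may differ. The assumption is that a
# partition converted for WholeGraph keeps its features in a "wholegraph"
# folder next to the partition JSON.
import os

def maybe_uses_wholegraph(part_config):
    part_dir = os.path.dirname(part_config)
    wg_dir = os.path.join(part_dir, "wholegraph")
    return os.path.isdir(wg_dir) and len(os.listdir(wg_dir)) > 0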
10 changes: 8 additions & 2 deletions python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py
@@ -27,15 +27,21 @@
GSgnnLinkPredictionPredefinedTestDataLoader)
from graphstorm.dataloading import BUILTIN_LP_UNIFORM_NEG_SAMPLER
from graphstorm.dataloading import BUILTIN_LP_JOINT_NEG_SAMPLER
-from graphstorm.utils import setup_device, get_lm_ntypes
+from graphstorm.utils import (
+    setup_device,
+    get_lm_ntypes,
+    use_wholegraph,
+)

def main(config_args):
    """ main function
    """
    config = GSConfig(config_args)
    config.verify_arguments(False)

-    gs.initialize(ip_config=config.ip_config, backend=config.backend)
+    use_wg_feats = use_wholegraph(config.part_config)
+    gs.initialize(ip_config=config.ip_config, backend=config.backend,
+                  use_wholegraph=config.use_wholegraph_sparse_emb or use_wg_feats)
    device = setup_device(config.local_rank)

    infer_data = GSgnnEdgeInferData(config.graph_name,
6 changes: 4 additions & 2 deletions python/graphstorm/run/gsgnn_np/np_infer_gnn.py
@@ -21,7 +21,7 @@
from graphstorm.inference import GSgnnNodePredictionInferrer
from graphstorm.eval import GSgnnAccEvaluator, GSgnnRegressionEvaluator
from graphstorm.dataloading import GSgnnNodeInferData, GSgnnNodeDataLoader
-from graphstorm.utils import setup_device, get_lm_ntypes
+from graphstorm.utils import setup_device, get_lm_ntypes, use_wholegraph

def get_evaluator(config): # pylint: disable=unused-argument
    """ Get evaluator class
@@ -42,7 +42,9 @@ def main(config_args):
    config = GSConfig(config_args)
    config.verify_arguments(False)

-    gs.initialize(ip_config=config.ip_config, backend=config.backend)
+    use_wg_feats = use_wholegraph(config.part_config)
+    gs.initialize(ip_config=config.ip_config, backend=config.backend,
+                  use_wholegraph=config.use_wholegraph_sparse_emb or use_wg_feats)
    device = setup_device(config.local_rank)

    infer_data = GSgnnNodeInferData(config.graph_name,
4 changes: 2 additions & 2 deletions tests/end2end-tests/check_np_infer_emb.py
@@ -106,10 +106,10 @@
# train nids [0, 1, 2, ...]
train_emb = train_emb[th.argsort(train_nids)]
for nid, inf_emb in zip(infer_nids, infer_emb):
-    assert_almost_equal(train_emb[nid].numpy(), inf_emb.numpy(), decimal=2)
+    assert_almost_equal(train_emb[nid].numpy(), inf_emb.numpy(), decimal=1)

train_remap_embs = {}
for nid, train_emb in zip(train_remaped_nids, train_remaped_emb):
    train_remap_embs[nid] = train_emb
for nid, inf_emb in zip(infer_remaped_nids, infer_remaped_emb):
-    assert_almost_equal(train_remap_embs[nid], inf_emb, decimal=2)
+    assert_almost_equal(train_remap_embs[nid], inf_emb, decimal=1)
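The tolerance is loosened from two decimal places to one, presumably to absorb small numerical differences once WholeGraph/NCCL runs are in the mix. In numpy terms, assert_almost_equal(a, b, decimal=d) passes when abs(a - b) < 1.5 * 10**(-d) elementwise, so the per-element budget goes from 0.015 to 0.15 — a standalone illustration:

# What the loosened tolerance buys:
import numpy as np
from numpy.testing import assert_almost_equal

a = np.array([0.50, 0.30])
b = np.array([0.44, 0.36])            # elementwise |diff| = 0.06
assert_almost_equal(a, b, decimal=1)  # passes: 0.06 < 0.15
# assert_almost_equal(a, b, decimal=2) would raise: 0.06 >= 0.015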
53 changes: 53 additions & 0 deletions tests/end2end-tests/graphstorm-ec/mgpu_test.sh
@@ -225,4 +225,57 @@ python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_s
error_and_exit $?
rm -fr /data/gsgnn_ec/*

echo "**************dataset: Generated multilabel MovieLens EC, RGCN layer: 1, node feat: generated feature, inference: full graph, exclude-training-targets: True, wholegraph learnable emb"
python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_multi_label_ec/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --exclude-training-targets True --use-node-embeddings true --multilabel true --num-classes 5 --use-mini-batch-infer false --topk-model-to-save 1 --save-embed-path /data/gsgnn_wg_ec/emb/ --save-model-path /data/gsgnn_wg_ec/ --save-model-frequency 1000 --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --backend nccl --use-wholegraph-sparse-emb True

error_and_exit $?

# check prints
cnt=$(grep "save_embed_path: /data/gsgnn_wg_ec/emb/" /tmp/train_log.txt | wc -l)
if test $cnt -lt 1
then
echo "We use SageMaker task tracker, we should have save_embed_path"
exit -1
fi

cnt=$(grep "save_model_path: /data/gsgnn_wg_ec/" /tmp/train_log.txt | wc -l)
if test $cnt -lt 1
then
echo "We use SageMaker task tracker, we should have save_model_path"
exit -1
fi

bst_cnt=$(grep "Best Test accuracy" /tmp/train_log.txt | wc -l)
if test $bst_cnt -lt 1
then
echo "We use SageMaker task tracker, we should have Best Test accuracy"
exit -1
fi


best_epoch=$(grep "successfully save the model to" /tmp/train_log.txt | tail -1 | tr -d '\n' | tail -c 1)
echo "The best model is saved in epoch $best_epoch"

rm /tmp/train_log.txt

echo "**************dataset: Generated multilabel MovieLens EC, do inference on saved model, wholegraph learnable emb"
python3 -m graphstorm.run.gs_edge_classification --inference --workspace $GS_HOME/inference_scripts/ep_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_multi_label_ec/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec_infer.yaml --multilabel true --num-classes 5 --use-node-embeddings true --use-mini-batch-infer false --save-embed-path /data/gsgnn_wg_ec/infer-emb/ --restore-model-path /data/gsgnn_wg_ec/epoch-$best_epoch/ --save-prediction-path /data/gsgnn_wg_ec/prediction/ --logging-file /tmp/log.txt --logging-level debug --preserve-input True --backend nccl --use-wholegraph-sparse-emb True

error_and_exit $?

bst_cnt=$(grep "Best Test accuracy" /tmp/log.txt | wc -l)
if test $bst_cnt -lt 1
then
echo "We use SageMaker task tracker, we should have Best Test accuracy"
exit -1
fi

rm /tmp/log.txt

cd $GS_HOME/tests/end2end-tests/
python3 check_infer.py --train-embout /data/gsgnn_wg_ec/emb/ --infer-embout /data/gsgnn_wg_ec/infer-emb/

error_and_exit $?

rm -fr /data/gsgnn_wg_ec/
rm -fr /tmp/*
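check_infer.py (invoked above, and again in the link-prediction test below) compares the embeddings saved during training against those saved by the inference run on the restored model. A rough sketch of the comparison it performs — assumed from its usage here, not taken from its actual source:

# Hedged sketch of the train-vs-infer embedding comparison (assumed shape;
# the real check_infer.py also handles remapped node IDs and more options):
import numpy as np
from numpy.testing import assert_almost_equal

def compare_embeddings(train_emb: np.ndarray, infer_emb: np.ndarray):
    # Same nodes, same order: training and inference embeddings produced by
    # the same saved model should agree within tolerance.
    assert train_emb.shape == infer_emb.shape
    assert_almost_equal(train_emb, infer_emb, decimal=1)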
62 changes: 62 additions & 0 deletions tests/end2end-tests/graphstorm-lp/mgpu_test.sh
@@ -549,4 +549,66 @@ error_and_exit $?

rm -fr /data/gsgnn_lp_ml_hard_dot/*

# wholegraph sparse embedding
echo "**************dataset: Movielens, RGCN layer 2, node feat: fixed HF BERT & sparse embed, BERT nodes: movie, inference: full-graph, negative_sampler: joint, exclude_training_targets: true, save model, wholegraph learnable emb"
python3 -m graphstorm.run.gs_link_prediction --workspace $GS_HOME/training_scripts/gsgnn_lp --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --exclude-training-targets True --reverse-edge-types-map user,rating,rating-rev,movie --save-model-path /data/gsgnn_lp_ml_wg_dot/ --topk-model-to-save 1 --save-model-frequency 1000 --save-embed-path /data/gsgnn_lp_ml_wg_dot/emb/ --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --use-wholegraph-sparse-emb True --backend nccl

error_and_exit $?

# check prints
cnt=$(grep "save_embed_path: /data/gsgnn_lp_ml_wg_dot/emb/" /tmp/train_log.txt | wc -l)
if test $cnt -lt 1
then
echo "We use SageMaker task tracker, we should have save_embed_path"
exit -1
fi

cnt=$(grep "save_model_path: /data/gsgnn_lp_ml_wg_dot/" /tmp/train_log.txt | wc -l)
if test $cnt -lt 1
then
echo "We use SageMaker task tracker, we should have save_model_path"
exit -1
fi

bst_cnt=$(grep "Best Test mrr" /tmp/train_log.txt | wc -l)
if test $bst_cnt -lt 1
then
echo "We use SageMaker task tracker, we should have Best Test mrr"
exit -1
fi

best_epoch_dot=$(grep "successfully save the model to" /tmp/train_log.txt | tail -1 | tr -d '\n' | tail -c 1)
echo "The best model is saved in epoch $best_epoch_dot"

cnt=$(ls /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/user/ | wc -l)
if test $cnt != 4
then
echo "The number of sparse emb files $cnt is not equal to the number of gpus 4"
exit -1
fi

cnt=$(ls /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/movie/ | wc -l)
if test $cnt != 4
then
echo "The number of sparse emb files $cnt is not equal to the number of gpus 4"
exit -1
fi

echo "**************dataset: Movielens, do inference on saved model, decoder: dot, wholegraph learnable emb"
python3 -m graphstorm.run.gs_link_prediction --inference --workspace $GS_HOME/inference_scripts/lp_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp_infer.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --save-embed-path /data/gsgnn_lp_ml_wg_dot/infer-emb/ --restore-model-path /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/ --logging-file /tmp/log.txt --preserve-input True --use-wholegraph-sparse-emb True --backend nccl

error_and_exit $?

bst_cnt=$(grep "Best Test mrr" /tmp/log.txt | wc -l)
if test $bst_cnt -lt 1
then
echo "We use SageMaker task tracker, we should have Best Test mrr"
exit -1
fi

python3 $GS_HOME/tests/end2end-tests/check_infer.py --train-embout /data/gsgnn_lp_ml_wg_dot/emb/ --infer-embout /data/gsgnn_lp_ml_wg_dot/infer-emb/ --link-prediction

error_and_exit $?

rm -fr /data/gsgnn_lp_ml_wg_dot/
rm -fr /tmp/*
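The file-count checks earlier in this test encode the expected on-disk layout: with --num-trainers 4, each trainer process saves one shard of the WholeGraph-hosted sparse embedding per node type, so epoch-N/user/ and epoch-N/movie/ should each hold exactly four files. The same check restated in Python — hypothetical paths, same assertion:

# Hypothetical re-statement of the shard-count check (paths are examples):
import os

def check_sparse_emb_shards(epoch_dir: str, ntype: str, num_gpus: int = 4):
    shard_dir = os.path.join(epoch_dir, ntype)
    n_files = len(os.listdir(shard_dir))
    assert n_files == num_gpus, \
        f"{n_files} sparse emb shards in {shard_dir}, expected {num_gpus}"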
50 changes: 50 additions & 0 deletions tests/end2end-tests/graphstorm-nc/mgpu_test.sh
@@ -463,4 +463,54 @@ python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_s

error_and_exit $?


echo "**************dataset: MovieLens classification, RGCN layer: 1, node feat: fixed HF BERT, BERT nodes: movie, inference: mini-batch save model save emb node, wholegraph learnable emb"
python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --save-model-path /data/gsgnn_wg_nc_ml/ --topk-model-to-save 1 --save-embed-path /data/gsgnn_wg_nc_ml/emb/ --num-epochs 3 --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --backend nccl --use-node-embeddings true --use-wholegraph-sparse-emb True

error_and_exit $?

# check prints
cnt=$(grep "save_embed_path: /data/gsgnn_wg_nc_ml/emb/" /tmp/train_log.txt | wc -l)
if test $cnt -lt 1
then
echo "We use SageMaker task tracker, we should have save_embed_path"
exit -1
fi

cnt=$(grep "save_model_path: /data/gsgnn_wg_nc_ml/" /tmp/train_log.txt | wc -l)
if test $cnt -lt 1
then
echo "We use SageMaker task tracker, we should have save_model_path"
exit -1
fi

bst_cnt=$(grep "Best Test accuracy" /tmp/train_log.txt | wc -l)
if test $bst_cnt -lt 1
then
echo "We use SageMaker task tracker, we should have Best Test accuracy"
exit -1
fi

best_epoch=$(grep "successfully save the model to" /tmp/train_log.txt | tail -1 | tr -d '\n' | tail -c 1)
echo "The best model is saved in epoch $best_epoch"

rm /tmp/train_log.txt

echo "**************dataset: Movielens, do inference on saved model, wholegraph learnable emb"
python3 -m graphstorm.run.gs_node_classification --inference --workspace $GS_HOME/inference_scripts/np_infer/ --num-trainers $NUM_INFERs --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc_infer.yaml --use-mini-batch-infer false --save-embed-path /data/gsgnn_wg_nc_ml/infer-emb/ --restore-model-path /data/gsgnn_wg_nc_ml/epoch-$best_epoch/ --save-prediction-path /data/gsgnn_wg_nc_ml/prediction/ --logging-file /tmp/log.txt --preserve-input True --backend nccl --use-node-embeddings true --use-wholegraph-sparse-emb True

error_and_exit $?

bst_cnt=$(grep "Best Test accuracy" /tmp/log.txt | wc -l)
if test $bst_cnt -lt 1
then
echo "We use SageMaker task tracker, we should have Best Test accuracy"
exit -1
fi

python3 $GS_HOME/tests/end2end-tests/check_np_infer_emb.py --train-embout /data/gsgnn_wg_nc_ml/emb/ --infer-embout /data/gsgnn_wg_nc_ml/infer-emb/

error_and_exit $?

rm -fr /data/gsgnn_wg_nc_ml/
rm -fr /tmp/*
