Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add end2end CI test for wholegraph #732

Merged
merged 14 commits into from
Feb 16, 2024
2 changes: 2 additions & 0 deletions .github/workflow_scripts/e2e_mgpu_check.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Move two directory levels up (../../) so the relative paths used below resolve.
cd ../../

# Install the pylibwholegraph-cu11 package, with NVIDIA's package index added as
# an extra source.
# NOTE(review): this runs before `set -ex`, so a failed install does not abort
# the script at this point — confirm that is intended.
pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com pylibwholegraph-cu11

# From here on, exit on the first failing command (-e) and echo every command (-x).
set -ex

# Run the end-to-end test setup script.
sh ./tests/end2end-tests/setup.sh
Expand Down
11 changes: 11 additions & 0 deletions python/graphstorm/model/embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,17 @@ def in_dims(self):
"""
return None

@property
def use_wholegraph_sparse_emb(self):
""" Whether or not to use WholeGraph to host embeddings for sparse updates.

This implementation always returns ``False``; a layer that supports
WholeGraph sparse embeddings is expected to provide its own value.

Note: By default, a GSNodeInputLayer does not support WholeGraph
sparse embedding, unless implemented specifically.
Note: GSNodeEncoderInputLayer supports WholeGraph sparse embedding.

Returns
-------
bool: Always ``False`` here.
"""
return False


class GSNodeEncoderInputLayer(GSNodeInputLayer):
"""The input encoder layer for all nodes in a heterogeneous graph.
Expand Down
6 changes: 4 additions & 2 deletions python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from graphstorm.inference import GSgnnEdgePredictionInferrer
from graphstorm.eval import GSgnnAccEvaluator, GSgnnRegressionEvaluator
from graphstorm.dataloading import GSgnnEdgeInferData, GSgnnEdgeDataLoader
from graphstorm.utils import setup_device, get_lm_ntypes
from graphstorm.utils import setup_device, get_lm_ntypes, use_wholegraph

def get_evaluator(config): # pylint: disable=unused-argument
""" Get evaluator class
Expand All @@ -43,7 +43,9 @@ def main(config_args):
config = GSConfig(config_args)
config.verify_arguments(False)

gs.initialize(ip_config=config.ip_config, backend=config.backend)
use_wg_feats = use_wholegraph(config.part_config)
gs.initialize(ip_config=config.ip_config, backend=config.backend,
use_wholegraph=config.use_wholegraph_sparse_emb or use_wg_feats)
device = setup_device(config.local_rank)

infer_data = GSgnnEdgeInferData(config.graph_name,
Expand Down
10 changes: 8 additions & 2 deletions python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,21 @@
GSgnnLinkPredictionPredefinedTestDataLoader)
from graphstorm.dataloading import BUILTIN_LP_UNIFORM_NEG_SAMPLER
from graphstorm.dataloading import BUILTIN_LP_JOINT_NEG_SAMPLER
from graphstorm.utils import setup_device, get_lm_ntypes
from graphstorm.utils import (
setup_device,
get_lm_ntypes,
use_wholegraph,
)

def main(config_args):
""" main function
"""
config = GSConfig(config_args)
config.verify_arguments(False)

gs.initialize(ip_config=config.ip_config, backend=config.backend)
use_wg_feats = use_wholegraph(config.part_config)
gs.initialize(ip_config=config.ip_config, backend=config.backend,
use_wholegraph=config.use_wholegraph_sparse_emb or use_wg_feats)
device = setup_device(config.local_rank)

infer_data = GSgnnEdgeInferData(config.graph_name,
Expand Down
6 changes: 4 additions & 2 deletions python/graphstorm/run/gsgnn_np/np_infer_gnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from graphstorm.inference import GSgnnNodePredictionInferrer
from graphstorm.eval import GSgnnAccEvaluator, GSgnnRegressionEvaluator
from graphstorm.dataloading import GSgnnNodeInferData, GSgnnNodeDataLoader
from graphstorm.utils import setup_device, get_lm_ntypes
from graphstorm.utils import setup_device, get_lm_ntypes, use_wholegraph

def get_evaluator(config): # pylint: disable=unused-argument
""" Get evaluator class
Expand All @@ -42,7 +42,9 @@ def main(config_args):
config = GSConfig(config_args)
config.verify_arguments(False)

gs.initialize(ip_config=config.ip_config, backend=config.backend)
use_wg_feats = use_wholegraph(config.part_config)
gs.initialize(ip_config=config.ip_config, backend=config.backend,
use_wholegraph=config.use_wholegraph_sparse_emb or use_wg_feats)
device = setup_device(config.local_rank)

infer_data = GSgnnNodeInferData(config.graph_name,
Expand Down
4 changes: 2 additions & 2 deletions tests/end2end-tests/check_np_infer_emb.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,10 @@
# train nids [0, 1, 2, ...]
train_emb = train_emb[th.argsort(train_nids)]
for nid, inf_emb in zip(infer_nids, infer_emb):
assert_almost_equal(train_emb[nid].numpy(), inf_emb.numpy(), decimal=2)
assert_almost_equal(train_emb[nid].numpy(), inf_emb.numpy(), decimal=1)

train_remap_embs = {}
for nid, train_emb in zip (train_remaped_nids, train_remaped_emb):
train_remap_embs[nid] = train_emb
for nid, inf_emb in zip(infer_remaped_nids, infer_remaped_emb):
assert_almost_equal(train_remap_embs[nid], inf_emb, decimal=2)
assert_almost_equal(train_remap_embs[nid], inf_emb, decimal=1)
53 changes: 53 additions & 0 deletions tests/end2end-tests/graphstorm-ec/mgpu_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -225,4 +225,57 @@ python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_s
error_and_exit $?
rm -fr /data/gsgnn_ec/*

# Test case: edge classification training with learnable node embeddings
# (--use-node-embeddings) and --use-wholegraph-sparse-emb enabled on the nccl backend.
echo "**************dataset: Generated multilabel MovieLens EC, RGCN layer: 1, node feat: generated feature, inference: full graph, exclude-training-targets: True, wholegraph learnable emb"
python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_multi_label_ec/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --exclude-training-targets True --use-node-embeddings true --multilabel true --num-classes 5 --use-mini-batch-infer false --topk-model-to-save 1 --save-embed-path /data/gsgnn_wg_ec/emb/ --save-model-path /data/gsgnn_wg_ec/ --save-model-frequency 1000 --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --backend nccl --use-wholegraph-sparse-emb True

# error_and_exit is defined outside this excerpt; it is handed the exit status
# of the training command above.
error_and_exit $?

# Check the training log: it must mention the configured embedding path, the
# configured model path, and at least one "Best Test accuracy" line.
cnt=$(grep "save_embed_path: /data/gsgnn_wg_ec/emb/" /tmp/train_log.txt | wc -l)
if test $cnt -lt 1
then
echo "We use SageMaker task tracker, we should have save_embed_path"
exit -1
fi

cnt=$(grep "save_model_path: /data/gsgnn_wg_ec/" /tmp/train_log.txt | wc -l)
if test $cnt -lt 1
then
echo "We use SageMaker task tracker, we should have save_model_path"
exit -1
fi

bst_cnt=$(grep "Best Test accuracy" /tmp/train_log.txt | wc -l)
if test $bst_cnt -lt 1
then
echo "We use SageMaker task tracker, we should have Best Test accuracy"
exit -1
fi


# Take the last "successfully save the model to" log line, drop its newline and
# keep only its final character as the epoch id.
# NOTE(review): `tail -c 1` keeps a single character, so this presumably relies
# on the saved epoch index being one digit — confirm.
best_epoch=$(grep "successfully save the model to" /tmp/train_log.txt | tail -1 | tr -d '\n' | tail -c 1)
echo "The best model is saved in epoch $best_epoch"

rm /tmp/train_log.txt

# Inference run: restores the model saved under epoch-$best_epoch and writes
# embeddings and predictions to separate output directories.
echo "**************dataset: Generated multilabel MovieLens EC, do inference on saved model, wholegraph learnable emb"
python3 -m graphstorm.run.gs_edge_classification --inference --workspace $GS_HOME/inference_scripts/ep_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_multi_label_ec/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec_infer.yaml --multilabel true --num-classes 5 --use-node-embeddings true --use-mini-batch-infer false --save-embed-path /data/gsgnn_wg_ec/infer-emb/ --restore-model-path /data/gsgnn_wg_ec/epoch-$best_epoch/ --save-prediction-path /data/gsgnn_wg_ec/prediction/ --logging-file /tmp/log.txt --logging-level debug --preserve-input True --backend nccl --use-wholegraph-sparse-emb True

error_and_exit $?

# The inference log must also report a "Best Test accuracy" line.
bst_cnt=$(grep "Best Test accuracy" /tmp/log.txt | wc -l)
if test $bst_cnt -lt 1
then
echo "We use SageMaker task tracker, we should have Best Test accuracy"
exit -1
fi

rm /tmp/log.txt

# check_infer.py is invoked by its bare file name, so switch to its directory first.
# It receives the training and inference embedding directories; presumably it
# compares the two — see check_infer.py for the exact checks.
cd $GS_HOME/tests/end2end-tests/
python3 check_infer.py --train-embout /data/gsgnn_wg_ec/emb/ --infer-embout /data/gsgnn_wg_ec/infer-emb/

error_and_exit $?

# Clean up this test case's outputs. Note that the second command removes
# everything under /tmp, not only files created by this test case.
rm -fr /data/gsgnn_wg_ec/
rm -fr /tmp/*
62 changes: 62 additions & 0 deletions tests/end2end-tests/graphstorm-lp/mgpu_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -549,4 +549,66 @@ error_and_exit $?

rm -fr /data/gsgnn_lp_ml_hard_dot/*

# wholegraph sparse embedding
# Test case: link prediction training with learnable node embeddings
# (--use-node-embeddings) and --use-wholegraph-sparse-emb enabled on the nccl backend.
echo "**************dataset: Movielens, RGCN layer 2, node feat: fixed HF BERT & sparse embed, BERT nodes: movie, inference: full-graph, negative_sampler: joint, exclude_training_targets: true, save model, wholegraph learnable emb"
python3 -m graphstorm.run.gs_link_prediction --workspace $GS_HOME/training_scripts/gsgnn_lp --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --exclude-training-targets True --reverse-edge-types-map user,rating,rating-rev,movie --save-model-path /data/gsgnn_lp_ml_wg_dot/ --topk-model-to-save 1 --save-model-frequency 1000 --save-embed-path /data/gsgnn_lp_ml_wg_dot/emb/ --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --use-wholegraph-sparse-emb True --backend nccl

# error_and_exit is defined outside this excerpt; it is handed the exit status
# of the training command above.
error_and_exit $?

# Check the training log: it must mention the configured embedding path, the
# configured model path, and at least one "Best Test mrr" line.
cnt=$(grep "save_embed_path: /data/gsgnn_lp_ml_wg_dot/emb/" /tmp/train_log.txt | wc -l)
if test $cnt -lt 1
then
echo "We use SageMaker task tracker, we should have save_embed_path"
exit -1
fi

cnt=$(grep "save_model_path: /data/gsgnn_lp_ml_wg_dot/" /tmp/train_log.txt | wc -l)
if test $cnt -lt 1
then
echo "We use SageMaker task tracker, we should have save_model_path"
exit -1
fi

bst_cnt=$(grep "Best Test mrr" /tmp/train_log.txt | wc -l)
if test $bst_cnt -lt 1
then
echo "We use SageMaker task tracker, we should have Best Test mrr"
exit -1
fi

# Take the last "successfully save the model to" log line, drop its newline and
# keep only its final character as the epoch id.
# NOTE(review): `tail -c 1` keeps a single character, so this presumably relies
# on the saved epoch index being one digit — confirm.
best_epoch_dot=$(grep "successfully save the model to" /tmp/train_log.txt | tail -1 | tr -d '\n' | tail -c 1)
echo "The best model is saved in epoch $best_epoch_dot"

# The saved epoch directory must hold exactly 4 entries for each of the "user"
# and "movie" node types (the error message equates this with the number of GPUs).
# NOTE(review): 4 is hard-coded; presumably it must match $NUM_TRAINERS — confirm.
cnt=$(ls /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/user/ | wc -l)
if test $cnt != 4
then
echo "The number of sparse emb files $cnt is not equal to the number of gpus 4"
exit -1
fi

cnt=$(ls /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/movie/ | wc -l)
if test $cnt != 4
then
echo "The number of sparse emb files $cnt is not equal to the number of gpus 4"
exit -1
fi

# Inference run: restores the model saved under epoch-$best_epoch_dot and
# writes embeddings to a separate infer-emb directory.
echo "**************dataset: Movielens, do inference on saved model, decoder: dot, wholegraph learnable emb"
python3 -m graphstorm.run.gs_link_prediction --inference --workspace $GS_HOME/inference_scripts/lp_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp_infer.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --save-embed-path /data/gsgnn_lp_ml_wg_dot/infer-emb/ --restore-model-path /data/gsgnn_lp_ml_wg_dot/epoch-$best_epoch_dot/ --logging-file /tmp/log.txt --preserve-input True --use-wholegraph-sparse-emb True --backend nccl

error_and_exit $?

# The inference log must also report a "Best Test mrr" line.
bst_cnt=$(grep "Best Test mrr" /tmp/log.txt | wc -l)
if test $bst_cnt -lt 1
then
echo "We use SageMaker task tracker, we should have Best Test mrr"
exit -1
fi

# check_infer.py receives the training and inference embedding directories in
# link-prediction mode; presumably it compares the two — see check_infer.py.
python3 $GS_HOME/tests/end2end-tests/check_infer.py --train-embout /data/gsgnn_lp_ml_wg_dot/emb/ --infer-embout /data/gsgnn_lp_ml_wg_dot/infer-emb/ --link-prediction

error_and_exit $?

# Clean up this test case's outputs. Note that the second command removes
# everything under /tmp, not only files created by this test case.
rm -fr /data/gsgnn_lp_ml_wg_dot/
rm -fr /tmp/*
50 changes: 50 additions & 0 deletions tests/end2end-tests/graphstorm-nc/mgpu_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -463,4 +463,54 @@ python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_s

error_and_exit $?


# Test case: node classification training (3 epochs) with learnable node embeddings
# (--use-node-embeddings) and --use-wholegraph-sparse-emb enabled on the nccl backend.
echo "**************dataset: MovieLens classification, RGCN layer: 1, node feat: fixed HF BERT, BERT nodes: movie, inference: mini-batch save model save emb node, wholegraph learnable emb"
python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --save-model-path /data/gsgnn_wg_nc_ml/ --topk-model-to-save 1 --save-embed-path /data/gsgnn_wg_nc_ml/emb/ --num-epochs 3 --logging-file /tmp/train_log.txt --logging-level debug --preserve-input True --backend nccl --use-node-embeddings true --use-wholegraph-sparse-emb True

# error_and_exit is defined outside this excerpt; it is handed the exit status
# of the training command above.
error_and_exit $?

# Check the training log: it must mention the configured embedding path, the
# configured model path, and at least one "Best Test accuracy" line.
cnt=$(grep "save_embed_path: /data/gsgnn_wg_nc_ml/emb/" /tmp/train_log.txt | wc -l)
if test $cnt -lt 1
then
echo "We use SageMaker task tracker, we should have save_embed_path"
exit -1
fi

cnt=$(grep "save_model_path: /data/gsgnn_wg_nc_ml/" /tmp/train_log.txt | wc -l)
if test $cnt -lt 1
then
echo "We use SageMaker task tracker, we should have save_model_path"
exit -1
fi

bst_cnt=$(grep "Best Test accuracy" /tmp/train_log.txt | wc -l)
if test $bst_cnt -lt 1
then
echo "We use SageMaker task tracker, we should have Best Test accuracy"
exit -1
fi

# Take the last "successfully save the model to" log line, drop its newline and
# keep only its final character as the epoch id.
# NOTE(review): `tail -c 1` keeps a single character, so this presumably relies
# on the saved epoch index being one digit — confirm.
best_epoch=$(grep "successfully save the model to" /tmp/train_log.txt | tail -1 | tr -d '\n' | tail -c 1)
echo "The best model is saved in epoch $best_epoch"

rm /tmp/train_log.txt

# Inference run: restores the model saved under epoch-$best_epoch and writes
# embeddings and predictions to separate output directories.
# NOTE(review): $NUM_INFERs is defined outside this excerpt — confirm the
# name (including its lowercase "s") matches the definition.
echo "**************dataset: Movielens, do inference on saved model, wholegraph learnable emb"
python3 -m graphstorm.run.gs_node_classification --inference --workspace $GS_HOME/inference_scripts/np_infer/ --num-trainers $NUM_INFERs --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc_infer.yaml --use-mini-batch-infer false --save-embed-path /data/gsgnn_wg_nc_ml/infer-emb/ --restore-model-path /data/gsgnn_wg_nc_ml/epoch-$best_epoch/ --save-prediction-path /data/gsgnn_wg_nc_ml/prediction/ --logging-file /tmp/log.txt --preserve-input True --backend nccl --use-node-embeddings true --use-wholegraph-sparse-emb True

error_and_exit $?

# The inference log must also report a "Best Test accuracy" line.
bst_cnt=$(grep "Best Test accuracy" /tmp/log.txt | wc -l)
if test $bst_cnt -lt 1
then
echo "We use SageMaker task tracker, we should have Best Test accuracy"
exit -1
fi

# check_np_infer_emb.py receives the training and inference embedding
# directories; presumably it compares the two — see that script for the checks.
python3 $GS_HOME/tests/end2end-tests/check_np_infer_emb.py --train-embout /data/gsgnn_wg_nc_ml/emb/ --infer-embout /data/gsgnn_wg_nc_ml/infer-emb/

error_and_exit $?

# Clean up this test case's outputs. Note that the second command removes
# everything under /tmp, not only files created by this test case.
rm -fr /data/gsgnn_wg_nc_ml/
rm -fr /tmp/*
Loading