Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Command Line for Embedding Generating #525

Merged
merged 43 commits into from
Oct 13, 2023
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
1d6432f
initial commit
jalencato Oct 3, 2023
a841911
first commit - no test
jalencato Oct 3, 2023
7607069
remove unnecessary dependency
jalencato Oct 3, 2023
4621c6e
change config
jalencato Oct 3, 2023
d8c0309
fix lint
jalencato Oct 3, 2023
323dbb0
test
jalencato Oct 4, 2023
05807e3
fix save_embed path
jalencato Oct 4, 2023
6514afe
add test
jalencato Oct 4, 2023
249df54
fix lint
jalencato Oct 4, 2023
c9de5e7
temp fix
jalencato Oct 4, 2023
2b25576
fix
jalencato Oct 4, 2023
63fbb6f
fix typo
jalencato Oct 4, 2023
b2ac45b
fix test
jalencato Oct 5, 2023
699dafa
fix test
jalencato Oct 5, 2023
c8991b0
fix
jalencato Oct 5, 2023
3442970
change test
jalencato Oct 5, 2023
273846c
rename the gs_gen_embedding to ge_gen_node_embedding
jalencato Oct 5, 2023
ce05d94
fix test
jalencato Oct 5, 2023
6196d19
Update mgpu_test.sh
jalencato Oct 5, 2023
264c80e
fix bug
jalencato Oct 5, 2023
96bdaf8
fix
jalencato Oct 5, 2023
ebe0d4b
fix embedding bug on link prediction
jalencato Oct 5, 2023
70feebd
use entire graph for embedding generation
jalencato Oct 6, 2023
a38df65
fix whole code structure
jalencato Oct 9, 2023
f642cf0
fix import bug
jalencato Oct 9, 2023
34e22e7
fix lint
jalencato Oct 9, 2023
0d9347f
fix lint
jalencato Oct 9, 2023
3c30b1b
fix bug for not restoring model
jalencato Oct 9, 2023
9c0be98
remove relation embedding
jalencato Oct 11, 2023
6822ef3
remove redundant dependency
jalencato Oct 11, 2023
2b79a47
fix lint
jalencato Oct 11, 2023
9c119de
change to trival version
jalencato Oct 12, 2023
45038f9
add doc string
jalencato Oct 12, 2023
6ceb0d0
fix edge task mini batch
jalencato Oct 12, 2023
5e39786
add
jalencato Oct 12, 2023
5704472
fix sorted bug
jalencato Oct 12, 2023
6de76ff
finish pruning
jalencato Oct 13, 2023
788297a
fix typo
jalencato Oct 13, 2023
0cd315f
apply comment
jalencato Oct 13, 2023
ada55ae
test
jalencato Oct 13, 2023
774187c
add embs
jalencato Oct 13, 2023
017e119
Merge branch 'main' into gen_embedding
jalencato Oct 13, 2023
6906543
fix typo
jalencato Oct 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions python/graphstorm/run/gs_gen_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""
zheng-da marked this conversation as resolved.
Show resolved Hide resolved
Copyright 2023 Contributors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Entry point for generating node embeddings with a trained GSgnn model.

Run as:
python3 -m graphstorm.run.gs_gen_embedding <Launch args>
"""
import os
import logging

from .launch import get_argument_parser
from .launch import check_input_arguments
from .launch import submit_jobs

def main():
    """ Main entry point of the embedding-generation launcher.

    Parses the launcher arguments, resolves the path of the distributed
    worker script (``gsgnn_emb/gsgnn_emb.py``) and submits the jobs.
    Arguments unknown to the launcher are forwarded verbatim to the
    worker script.
    """
    parser = get_argument_parser()
    args, exec_script_args = parser.parse_known_args()
    check_input_arguments(args)

    lib_dir = os.path.abspath(os.path.dirname(__file__))
    cmd_path = os.path.join(lib_dir, "gsgnn_emb", "gsgnn_emb.py")
    exec_script_args = [cmd_path] + exec_script_args

    # Full-graph inference (e.g. for link-prediction models) requires the
    # COO graph format, so make sure it is always requested.
    if "coo" not in args.graph_format:
        args.graph_format = f"{args.graph_format},coo"
        # Original message said "for link prediction" — this launcher does
        # embedding generation, so the message is corrected here.
        logging.debug("Automatically add COO format to graph formats for "
                      "embedding generation. New graph_format is %s",
                      args.graph_format)
    submit_jobs(args, exec_script_args)

if __name__ == "__main__":
    # Configure the root logger before delegating to main() so launcher
    # messages (including the COO-format debug note) are visible.
    FMT = "%(asctime)s %(levelname)s %(message)s"
    logging.basicConfig(format=FMT, level=logging.INFO)
    main()

Empty file.
111 changes: 111 additions & 0 deletions python/graphstorm/run/gsgnn_emb/gsgnn_emb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
"""
Copyright 2023 Contributors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

GSgnn GPU-based node embedding generation.
"""
import graphstorm as gs
from graphstorm.config import get_argument_parser
from graphstorm.config import GSConfig
from graphstorm.dataloading import GSgnnLPTrainData, GSgnnNodeTrainData, GSgnnEdgeTrainData
from graphstorm.model.utils import save_embeddings
from graphstorm.model import do_full_graph_inference
from graphstorm.utils import rt_profiler, sys_tracker, setup_device, use_wholegraph
from graphstorm.config import (BUILTIN_TASK_NODE_CLASSIFICATION,
BUILTIN_TASK_NODE_REGRESSION,
BUILTIN_TASK_EDGE_CLASSIFICATION,
BUILTIN_TASK_EDGE_REGRESSION,
BUILTIN_TASK_LINK_PREDICTION)

def main(config_args):
    """ Generate node embeddings with a pre-trained GSgnn model.

    Builds the dataset and model matching ``config.task_type``, restores
    the model parameters from ``config.restore_model_path``, runs
    full-graph inference, and saves the node embeddings to
    ``config.save_embed_path``.

    Parameters
    ----------
    config_args : argparse.Namespace
        Command-line arguments used to construct a ``GSConfig``.

    Raises
    ------
    TypeError
        If ``config.task_type`` is not a supported built-in task.
    """
    config = GSConfig(config_args)
    config.verify_arguments(True)

    # Validate the embedding-generation specific settings before any
    # expensive work (graph loading, model creation) so that a
    # misconfiguration fails fast.
    assert config.save_embed_path is not None, \
        "save embedding path cannot be none for gs_gen_embeddings"
    assert config.restore_model_path is not None, \
        "restore model path cannot be none for gs_gen_embeddings"

    gs.initialize(ip_config=config.ip_config, backend=config.backend,
                  use_wholegraph=use_wholegraph(config.part_config))
    rt_profiler.init(config.profile_path, rank=gs.get_rank())
    sys_tracker.init(config.verbose, rank=gs.get_rank())
    device = setup_device(config.local_rank)
    tracker = gs.create_builtin_task_tracker(config)
    if gs.get_rank() == 0:
        tracker.log_params(config.__dict__)

    # BUGFIX: the original conditions were written as
    # `config.task_type == A or B`, where the bare constant `B` is a
    # non-empty string and therefore always truthy — every task type fell
    # into the node-task branch. Use membership tests instead.
    if config.task_type == BUILTIN_TASK_LINK_PREDICTION:
        train_data = GSgnnLPTrainData(config.graph_name,
                                      config.part_config,
                                      train_etypes=config.train_etype,
                                      eval_etypes=config.eval_etype,
                                      node_feat_field=config.node_feat_name,
                                      pos_graph_feat_field=config.lp_edge_weight_for_loss)
    elif config.task_type in (BUILTIN_TASK_NODE_REGRESSION,
                              BUILTIN_TASK_NODE_CLASSIFICATION):
        train_data = GSgnnNodeTrainData(config.graph_name,
                                        config.part_config,
                                        train_ntypes=config.target_ntype,
                                        eval_ntypes=config.eval_target_ntype,
                                        node_feat_field=config.node_feat_name,
                                        label_field=config.label_field)
    elif config.task_type in (BUILTIN_TASK_EDGE_CLASSIFICATION,
                              BUILTIN_TASK_EDGE_REGRESSION):
        train_data = GSgnnEdgeTrainData(config.graph_name,
                                        config.part_config,
                                        train_etypes=config.target_etype,
                                        node_feat_field=config.node_feat_name,
                                        label_field=config.label_field,
                                        decoder_edge_feat=config.decoder_edge_feat)
    else:
        # Use a single formatted message instead of the original tuple
        # argument, which rendered as ('Not supported...', '<type>').
        raise TypeError(f"Not supported for task type: {config.task_type}")

    if config.task_type == BUILTIN_TASK_LINK_PREDICTION:
        model = gs.create_builtin_lp_gnn_model(train_data.g, config, train_task=False)
    elif config.task_type in (BUILTIN_TASK_NODE_REGRESSION,
                              BUILTIN_TASK_NODE_CLASSIFICATION):
        model = gs.create_builtin_node_gnn_model(train_data.g, config, train_task=False)
    else:
        # Edge classification/regression; unsupported types raised above.
        model = gs.create_builtin_edge_gnn_model(train_data.g, config, train_task=False)

    # TODO(zhengda) the model path has to be in a shared filesystem.
    model.restore_model(config.restore_model_path)
    # Preparing input layer for inference. The input layer can pre-compute
    # node features in the preparing step if needed,
    # e.g. pre-compute all BERT embeddings.
    model.prepare_input_encoder(train_data)
    # TODO(zhengda) we may not want to only use training edges to generate GNN embeddings.
    embeddings = do_full_graph_inference(model, train_data, fanout=config.eval_fanout,
                                         task_tracker=tracker)
    save_embeddings(config.save_embed_path, embeddings, gs.get_rank(),
                    gs.get_world_size(),
                    device=device,
                    node_id_mapping_file=config.node_id_mapping_file,
                    save_embed_format=config.save_embed_format)


def generate_parser():
    """ Build and return the command-line argument parser for this script.
    """
    return get_argument_parser()


if __name__ == '__main__':
    # Script entry point: parse command-line arguments into a config
    # namespace and run embedding generation.
    arg_parser = generate_parser()

    args = arg_parser.parse_args()
    main(args)
3 changes: 2 additions & 1 deletion python/graphstorm/run/gsgnn_ep/gsgnn_ep.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,8 @@ def main(config_args):
save_embeddings(config.save_embed_path, embs, gs.get_rank(),
jalencato marked this conversation as resolved.
Show resolved Hide resolved
gs.get_world_size(),
device=device,
node_id_mapping_file=config.node_id_mapping_file)
node_id_mapping_file=config.node_id_mapping_file,
save_embed_format=config.save_embed_format)

def generate_parser():
""" Generate an argument parser
Expand Down
3 changes: 2 additions & 1 deletion python/graphstorm/run/gsgnn_lp/gsgnn_lp.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,8 @@ def main(config_args):
save_embeddings(config.save_embed_path, embeddings, gs.get_rank(),
gs.get_world_size(),
device=device,
node_id_mapping_file=config.node_id_mapping_file)
node_id_mapping_file=config.node_id_mapping_file,
save_embed_format=config.save_embed_format)

def generate_parser():
""" Generate an argument parser
Expand Down
3 changes: 2 additions & 1 deletion python/graphstorm/run/gsgnn_np/gsgnn_np.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,8 @@ def main(config_args):
save_embeddings(config.save_embed_path, embeddings, gs.get_rank(),
gs.get_world_size(),
device=device,
node_id_mapping_file=config.node_id_mapping_file)
node_id_mapping_file=config.node_id_mapping_file,
save_embed_format=config.save_embed_format)

def generate_parser():
""" Generate an argument parser
Expand Down
2 changes: 1 addition & 1 deletion python/graphstorm/run/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -908,7 +908,7 @@ def check_input_arguments(args):
), "--num-servers must be a positive number."
assert (
args.part_config is not None
), "A user has to specify a partition configuration file with --part-onfig."
), "A user has to specify a partition configuration file with --part-config."
assert (
args.ip_config is not None
), "A user has to specify an IP configuration file with --ip-config."
Expand Down
9 changes: 9 additions & 0 deletions tests/end2end-tests/graphstorm-ec/mgpu_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,15 @@ python3 check_infer.py --train_embout /data/gsgnn_ec/emb/ --infer_embout /data/g

error_and_exit $?

echo "**************dataset: Movielens, use gen_embeddings to generate embeddings on edge classification"
# Generate embeddings from the best EC checkpoint with the standalone
# gs_gen_embedding entry point, saving them under /data/gsgnn_ec/save-emb/.
python3 -m graphstorm.run.gs_gen_embedding --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_multi_label_ec/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --restore-model-path /data/gsgnn_ec/epoch-$best_epoch/ --save-embed-path /data/gsgnn_ec/save-emb/ --logging-file /tmp/train_log.txt --logging-level debug

error_and_exit $?

# The generated embeddings must match the ones saved during training.
python3 $GS_HOME/tests/end2end-tests/check_infer.py --train_embout /data/gsgnn_ec/emb/ --infer_embout /data/gsgnn_ec/save-emb/

error_and_exit $?

echo "**************dataset: Generated multilabel MovieLens EC, do inference on saved model without test_mask"
python3 -m graphstorm.run.gs_edge_classification --inference --workspace $GS_HOME/inference_scripts/ep_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_ec_no_test_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec_infer.yaml --multilabel true --num-classes 6 --node-feat-name movie:title user:feat --use-mini-batch-infer false --save-embed-path /data/gsgnn_ec/infer-emb/ --restore-model-path /data/gsgnn_ec/epoch-$best_epoch/ --save-prediction-path /data/gsgnn_ec/prediction/ --no-validation true

Expand Down
9 changes: 9 additions & 0 deletions tests/end2end-tests/graphstorm-lp/mgpu_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,15 @@ then
fi
rm -fr /data/gsgnn_lp_ml_dot/infer-emb/

echo "**************dataset: Movielens, use gen_embeddings to generate embeddings on link prediction"
# Generate embeddings from the distmult checkpoint with the standalone
# gs_gen_embedding entry point.
# NOTE(review): the restore path uses the distmult model with the
# $best_epoch_dot epoch variable — confirm a distmult-specific best-epoch
# variable is not intended here.
python3 -m graphstorm.run.gs_gen_embedding --workspace $GS_HOME/training_scripts/gsgnn_lp --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp.yaml --fanout '10,15' --num-layers 2 --use-node-embeddings true --restore-model-path /data/gsgnn_lp_ml_distmult/epoch-$best_epoch_dot/ --save-embed-path /data/gsgnn_lp_ml_distmult/save-emb/ --lp-decoder-type distmult --train-etype user,rating,movie movie,rating-rev,user --logging-file /tmp/train_log.txt --logging-level debug

error_and_exit $?

# BUGFIX: the check previously compared the /data/gsgnn_lp_ml_dot/ paths,
# but the embeddings above are generated from the distmult model and saved
# under /data/gsgnn_lp_ml_distmult/save-emb/ — compare matching paths.
python3 $GS_HOME/tests/end2end-tests/check_infer.py --train_embout /data/gsgnn_lp_ml_distmult/emb/ --infer_embout /data/gsgnn_lp_ml_distmult/save-emb/

error_and_exit $?

echo "**************dataset: Movielens, do mini-batch inference on saved model, decoder: dot"
python3 -m graphstorm.run.gs_link_prediction --inference --workspace $GS_HOME/inference_scripts/lp_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp_infer.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --save-embed-path /data/gsgnn_lp_ml_dot/infer-emb/ --restore-model-path /data/gsgnn_lp_ml_dot/epoch-$best_epoch_dot/ --use-mini-batch-infer true --logging-file /tmp/log.txt

Expand Down
9 changes: 9 additions & 0 deletions tests/end2end-tests/graphstorm-nc/mgpu_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,15 @@ python3 $GS_HOME/tests/end2end-tests/check_infer.py --train_embout /data/gsgnn_n

error_and_exit $?

echo "**************dataset: Movielens, use gen_embeddings to generate embeddings on node classification"
# Generate embeddings from the best NC checkpoint with the standalone
# gs_gen_embedding entry point, saving them under /data/gsgnn_nc_ml/save-emb.
python3 -m graphstorm.run.gs_gen_embedding --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --restore-model-path /data/gsgnn_nc_ml/epoch-$best_epoch/ --save-embed-path /data/gsgnn_nc_ml/save-emb --logging-file /tmp/train_log.txt --logging-level debug

error_and_exit $?

# The generated embeddings must match the ones saved during training.
python3 $GS_HOME/tests/end2end-tests/check_infer.py --train_embout /data/gsgnn_nc_ml/emb/ --infer_embout /data/gsgnn_nc_ml/save-emb/

error_and_exit $?

echo "**************dataset: Movielens, do inference on saved model with mini-batch-infer without test mask"
python3 -m graphstorm.run.gs_node_classification --inference --workspace $GS_HOME/inference_scripts/np_infer/ --num-trainers $NUM_INFERs --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_train_notest_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc_infer.yaml --use-mini-batch-infer true --save-embed-path /data/gsgnn_nc_ml/mini-infer-emb/ --restore-model-path /data/gsgnn_nc_ml/epoch-$best_epoch/ --save-prediction-path /data/gsgnn_nc_ml/prediction/ --no-validation true

Expand Down
Loading