Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Command Line for Embedding Generating #525

Merged
merged 43 commits into from
Oct 13, 2023
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
1d6432f
initial commit
jalencato Oct 3, 2023
a841911
first commit - no test
jalencato Oct 3, 2023
7607069
remove unnecessary dependency
jalencato Oct 3, 2023
4621c6e
change config
jalencato Oct 3, 2023
d8c0309
fix lint
jalencato Oct 3, 2023
323dbb0
test
jalencato Oct 4, 2023
05807e3
fix save_embed path
jalencato Oct 4, 2023
6514afe
add test
jalencato Oct 4, 2023
249df54
fix lint
jalencato Oct 4, 2023
c9de5e7
temp fix
jalencato Oct 4, 2023
2b25576
fix
jalencato Oct 4, 2023
63fbb6f
fix typo
jalencato Oct 4, 2023
b2ac45b
fix test
jalencato Oct 5, 2023
699dafa
fix test
jalencato Oct 5, 2023
c8991b0
fix
jalencato Oct 5, 2023
3442970
change test
jalencato Oct 5, 2023
273846c
rename the gs_gen_embedding to ge_gen_node_embedding
jalencato Oct 5, 2023
ce05d94
fix test
jalencato Oct 5, 2023
6196d19
Update mgpu_test.sh
jalencato Oct 5, 2023
264c80e
fix bug
jalencato Oct 5, 2023
96bdaf8
fix
jalencato Oct 5, 2023
ebe0d4b
fix embedding bug on link prediction
jalencato Oct 5, 2023
70feebd
use entire graph for embedding generation
jalencato Oct 6, 2023
a38df65
fix whole code structure
jalencato Oct 9, 2023
f642cf0
fix import bug
jalencato Oct 9, 2023
34e22e7
fix lint
jalencato Oct 9, 2023
0d9347f
fix lint
jalencato Oct 9, 2023
3c30b1b
fix bug for not restoring model
jalencato Oct 9, 2023
9c0be98
remove relation embedding
jalencato Oct 11, 2023
6822ef3
remove redundant dependency
jalencato Oct 11, 2023
2b79a47
fix lint
jalencato Oct 11, 2023
9c119de
change to trival version
jalencato Oct 12, 2023
45038f9
add doc string
jalencato Oct 12, 2023
6ceb0d0
fix edge task mini batch
jalencato Oct 12, 2023
5e39786
add
jalencato Oct 12, 2023
5704472
fix sorted bug
jalencato Oct 12, 2023
6de76ff
finish pruning
jalencato Oct 13, 2023
788297a
fix typo
jalencato Oct 13, 2023
0cd315f
apply comment
jalencato Oct 13, 2023
ada55ae
test
jalencato Oct 13, 2023
774187c
add embs
jalencato Oct 13, 2023
017e119
Merge branch 'main' into gen_embedding
jalencato Oct 13, 2023
6906543
fix typo
jalencato Oct 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions python/graphstorm/run/gs_gen_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""
zheng-da marked this conversation as resolved.
Show resolved Hide resolved
Copyright 2023 Contributors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Entry point for generating node embeddings with a trained GSgnn model.

Run as:
python3 -m graphstorm.run.gs_gen_embedding <Launch args>
"""
import os
import logging

from .launch import get_argument_parser
from .launch import check_input_arguments
from .launch import submit_jobs

def main():
    """ Main entry point of the embedding-generation launcher.

    Parses the launcher arguments, resolves the path of the distributed
    worker script (``gsgnn_emb/gsgnn_emb.py``) and submits the jobs.
    Arguments unknown to the launcher are forwarded verbatim to the
    worker script.
    """
    parser = get_argument_parser()
    args, exec_script_args = parser.parse_known_args()
    check_input_arguments(args)

    lib_dir = os.path.abspath(os.path.dirname(__file__))
    cmd_path = os.path.join(lib_dir, "gsgnn_emb", "gsgnn_emb.py")
    exec_script_args = [cmd_path] + exec_script_args

    # Full-graph inference (e.g. for link-prediction models) requires the
    # COO graph format, so make sure it is always requested.
    if "coo" not in args.graph_format:
        args.graph_format = f"{args.graph_format},coo"
        # Original message said "for link prediction" — this launcher does
        # embedding generation, so the message is corrected here.
        logging.debug("Automatically add COO format to graph formats for "
                      "embedding generation. New graph_format is %s",
                      args.graph_format)
    submit_jobs(args, exec_script_args)

if __name__ == "__main__":
    # Configure the root logger before delegating to main() so launcher
    # messages (including the COO-format debug note) are visible.
    FMT = "%(asctime)s %(levelname)s %(message)s"
    logging.basicConfig(format=FMT, level=logging.INFO)
    main()

Empty file.
111 changes: 111 additions & 0 deletions python/graphstorm/run/gsgnn_emb/gsgnn_emb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
"""
Copyright 2023 Contributors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

GSgnn GPU-based node embedding generation.
"""
import graphstorm as gs
from graphstorm.config import get_argument_parser
from graphstorm.config import GSConfig
from graphstorm.dataloading import GSgnnLPTrainData, GSgnnNodeTrainData, GSgnnEdgeTrainData
from graphstorm.model.utils import save_embeddings
from graphstorm.model import do_full_graph_inference
from graphstorm.utils import rt_profiler, sys_tracker, setup_device, use_wholegraph
from graphstorm.config import (BUILTIN_TASK_NODE_CLASSIFICATION,
BUILTIN_TASK_NODE_REGRESSION,
BUILTIN_TASK_EDGE_CLASSIFICATION,
BUILTIN_TASK_EDGE_REGRESSION,
BUILTIN_TASK_LINK_PREDICTION)

def main(config_args):
    """ Generate node embeddings with a pre-trained GSgnn model.

    Builds the dataset and model matching ``config.task_type``, restores
    the model parameters from ``config.restore_model_path``, runs
    full-graph inference, and saves the node embeddings to
    ``config.save_embed_path``.

    Parameters
    ----------
    config_args : argparse.Namespace
        Command-line arguments used to construct a ``GSConfig``.

    Raises
    ------
    TypeError
        If ``config.task_type`` is not a supported built-in task.
    """
    config = GSConfig(config_args)
    config.verify_arguments(True)

    # Validate the embedding-generation specific settings before any
    # expensive work (graph loading, model creation) so that a
    # misconfiguration fails fast.
    assert config.save_embed_path is not None, \
        "save embedding path cannot be none for gs_gen_embeddings"
    assert config.restore_model_path is not None, \
        "restore model path cannot be none for gs_gen_embeddings"

    gs.initialize(ip_config=config.ip_config, backend=config.backend,
                  use_wholegraph=use_wholegraph(config.part_config))
    rt_profiler.init(config.profile_path, rank=gs.get_rank())
    sys_tracker.init(config.verbose, rank=gs.get_rank())
    device = setup_device(config.local_rank)
    tracker = gs.create_builtin_task_tracker(config)
    if gs.get_rank() == 0:
        tracker.log_params(config.__dict__)

    # BUGFIX: the original conditions were written as
    # `config.task_type == A or B`, where the bare constant `B` is a
    # non-empty string and therefore always truthy — every task type fell
    # into the node-task branch. Use membership tests instead.
    if config.task_type == BUILTIN_TASK_LINK_PREDICTION:
        train_data = GSgnnLPTrainData(config.graph_name,
                                      config.part_config,
                                      train_etypes=config.train_etype,
                                      eval_etypes=config.eval_etype,
                                      node_feat_field=config.node_feat_name,
                                      pos_graph_feat_field=config.lp_edge_weight_for_loss)
    elif config.task_type in (BUILTIN_TASK_NODE_REGRESSION,
                              BUILTIN_TASK_NODE_CLASSIFICATION):
        train_data = GSgnnNodeTrainData(config.graph_name,
                                        config.part_config,
                                        train_ntypes=config.target_ntype,
                                        eval_ntypes=config.eval_target_ntype,
                                        node_feat_field=config.node_feat_name,
                                        label_field=config.label_field)
    elif config.task_type in (BUILTIN_TASK_EDGE_CLASSIFICATION,
                              BUILTIN_TASK_EDGE_REGRESSION):
        train_data = GSgnnEdgeTrainData(config.graph_name,
                                        config.part_config,
                                        train_etypes=config.target_etype,
                                        node_feat_field=config.node_feat_name,
                                        label_field=config.label_field,
                                        decoder_edge_feat=config.decoder_edge_feat)
    else:
        # Use a single formatted message instead of the original tuple
        # argument, which rendered as ('Not supported...', '<type>').
        raise TypeError(f"Not supported for task type: {config.task_type}")

    if config.task_type == BUILTIN_TASK_LINK_PREDICTION:
        model = gs.create_builtin_lp_gnn_model(train_data.g, config, train_task=False)
    elif config.task_type in (BUILTIN_TASK_NODE_REGRESSION,
                              BUILTIN_TASK_NODE_CLASSIFICATION):
        model = gs.create_builtin_node_gnn_model(train_data.g, config, train_task=False)
    else:
        # Edge classification/regression; unsupported types raised above.
        model = gs.create_builtin_edge_gnn_model(train_data.g, config, train_task=False)

    # TODO(zhengda) the model path has to be in a shared filesystem.
    model.restore_model(config.restore_model_path)
    # Preparing input layer for inference. The input layer can pre-compute
    # node features in the preparing step if needed,
    # e.g. pre-compute all BERT embeddings.
    model.prepare_input_encoder(train_data)
    # TODO(zhengda) we may not want to only use training edges to generate GNN embeddings.
    embeddings = do_full_graph_inference(model, train_data, fanout=config.eval_fanout,
                                         task_tracker=tracker)
    save_embeddings(config.save_embed_path, embeddings, gs.get_rank(),
                    gs.get_world_size(),
                    device=device,
                    node_id_mapping_file=config.node_id_mapping_file,
                    save_embed_format=config.save_embed_format)


def generate_parser():
    """ Build and return the command-line argument parser for this script.
    """
    return get_argument_parser()


if __name__ == '__main__':
    # Script entry point: parse command-line arguments into a config
    # namespace and run embedding generation.
    arg_parser = generate_parser()

    args = arg_parser.parse_args()
    main(args)
3 changes: 2 additions & 1 deletion python/graphstorm/run/gsgnn_ep/gsgnn_ep.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,8 @@ def main(config_args):
save_embeddings(config.save_embed_path, embs, gs.get_rank(),
jalencato marked this conversation as resolved.
Show resolved Hide resolved
gs.get_world_size(),
device=device,
node_id_mapping_file=config.node_id_mapping_file)
node_id_mapping_file=config.node_id_mapping_file,
save_embed_format=config.save_embed_format)

def generate_parser():
""" Generate an argument parser
Expand Down
3 changes: 2 additions & 1 deletion python/graphstorm/run/gsgnn_lp/gsgnn_lp.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,8 @@ def main(config_args):
save_embeddings(config.save_embed_path, embeddings, gs.get_rank(),
gs.get_world_size(),
device=device,
node_id_mapping_file=config.node_id_mapping_file)
node_id_mapping_file=config.node_id_mapping_file,
save_embed_format=config.save_embed_format)

def generate_parser():
""" Generate an argument parser
Expand Down
3 changes: 2 additions & 1 deletion python/graphstorm/run/gsgnn_np/gsgnn_np.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,8 @@ def main(config_args):
save_embeddings(config.save_embed_path, embeddings, gs.get_rank(),
gs.get_world_size(),
device=device,
node_id_mapping_file=config.node_id_mapping_file)
node_id_mapping_file=config.node_id_mapping_file,
save_embed_format=config.save_embed_format)

def generate_parser():
""" Generate an argument parser
Expand Down
2 changes: 1 addition & 1 deletion python/graphstorm/run/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -908,7 +908,7 @@ def check_input_arguments(args):
), "--num-servers must be a positive number."
assert (
args.part_config is not None
), "A user has to specify a partition configuration file with --part-onfig."
), "A user has to specify a partition configuration file with --part-config."
assert (
args.ip_config is not None
), "A user has to specify an IP configuration file with --ip-config."
Expand Down
9 changes: 9 additions & 0 deletions tests/end2end-tests/graphstorm-ec/mgpu_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,15 @@ python3 check_infer.py --train_embout /data/gsgnn_ec/emb/ --infer_embout /data/g

error_and_exit $?

echo "**************dataset: Movielens, use gen_embeddings to generate embeddings on edge classification"
# Generate embeddings from the best EC checkpoint with the standalone
# gs_gen_embedding entry point, saving them under /data/gsgnn_ec/save-emb/.
python3 -m graphstorm.run.gs_gen_embedding --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_multi_label_ec/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --restore-model-path /data/gsgnn_ec/epoch-$best_epoch/ --save-embed-path /data/gsgnn_ec/save-emb/ --logging-file /tmp/train_log.txt --logging-level debug

error_and_exit $?

# The generated embeddings must match the ones saved during training.
python3 $GS_HOME/tests/end2end-tests/check_infer.py --train_embout /data/gsgnn_ec/emb/ --infer_embout /data/gsgnn_ec/save-emb/

error_and_exit $?

echo "**************dataset: Generated multilabel MovieLens EC, do inference on saved model without test_mask"
python3 -m graphstorm.run.gs_edge_classification --inference --workspace $GS_HOME/inference_scripts/ep_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_ec_no_test_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec_infer.yaml --multilabel true --num-classes 6 --node-feat-name movie:title user:feat --use-mini-batch-infer false --save-embed-path /data/gsgnn_ec/infer-emb/ --restore-model-path /data/gsgnn_ec/epoch-$best_epoch/ --save-prediction-path /data/gsgnn_ec/prediction/ --no-validation true

Expand Down
9 changes: 9 additions & 0 deletions tests/end2end-tests/graphstorm-lp/mgpu_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,15 @@ then
fi
rm -fr /data/gsgnn_lp_ml_dot/infer-emb/

echo "**************dataset: Movielens, use gen_embeddings to generate embeddings on link prediction"
# Generate embeddings from the distmult checkpoint with the standalone
# gs_gen_embedding entry point.
# NOTE(review): the restore path uses the distmult model with the
# $best_epoch_dot epoch variable — confirm a distmult-specific best-epoch
# variable is not intended here.
python3 -m graphstorm.run.gs_gen_embedding --workspace $GS_HOME/training_scripts/gsgnn_lp --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp.yaml --fanout '10,15' --num-layers 2 --use-node-embeddings true --restore-model-path /data/gsgnn_lp_ml_distmult/epoch-$best_epoch_dot/ --save-embed-path /data/gsgnn_lp_ml_distmult/save-emb/ --lp-decoder-type distmult --train-etype user,rating,movie movie,rating-rev,user --logging-file /tmp/train_log.txt --logging-level debug

error_and_exit $?

# BUGFIX: the check previously compared the /data/gsgnn_lp_ml_dot/ paths,
# but the embeddings above are generated from the distmult model and saved
# under /data/gsgnn_lp_ml_distmult/save-emb/ — compare matching paths.
python3 $GS_HOME/tests/end2end-tests/check_infer.py --train_embout /data/gsgnn_lp_ml_distmult/emb/ --infer_embout /data/gsgnn_lp_ml_distmult/save-emb/

error_and_exit $?

echo "**************dataset: Movielens, do mini-batch inference on saved model, decoder: dot"
python3 -m graphstorm.run.gs_link_prediction --inference --workspace $GS_HOME/inference_scripts/lp_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp_infer.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --use-node-embeddings true --eval-batch-size 1024 --save-embed-path /data/gsgnn_lp_ml_dot/infer-emb/ --restore-model-path /data/gsgnn_lp_ml_dot/epoch-$best_epoch_dot/ --use-mini-batch-infer true --logging-file /tmp/log.txt

Expand Down
9 changes: 9 additions & 0 deletions tests/end2end-tests/graphstorm-nc/mgpu_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,15 @@ python3 $GS_HOME/tests/end2end-tests/check_infer.py --train_embout /data/gsgnn_n

error_and_exit $?

echo "**************dataset: Movielens, use gen_embeddings to generate embeddings on node classification"
# Generate embeddings from the best NC checkpoint with the standalone
# gs_gen_embedding entry point, saving them under /data/gsgnn_nc_ml/save-emb.
python3 -m graphstorm.run.gs_gen_embedding --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_train_val_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --restore-model-path /data/gsgnn_nc_ml/epoch-$best_epoch/ --save-embed-path /data/gsgnn_nc_ml/save-emb --logging-file /tmp/train_log.txt --logging-level debug

error_and_exit $?

# The generated embeddings must match the ones saved during training.
python3 $GS_HOME/tests/end2end-tests/check_infer.py --train_embout /data/gsgnn_nc_ml/emb/ --infer_embout /data/gsgnn_nc_ml/save-emb/

error_and_exit $?

echo "**************dataset: Movielens, do inference on saved model with mini-batch-infer without test mask"
python3 -m graphstorm.run.gs_node_classification --inference --workspace $GS_HOME/inference_scripts/np_infer/ --num-trainers $NUM_INFERs --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_train_notest_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc_infer.yaml --use-mini-batch-infer true --save-embed-path /data/gsgnn_nc_ml/mini-infer-emb/ --restore-model-path /data/gsgnn_nc_ml/epoch-$best_epoch/ --save-prediction-path /data/gsgnn_nc_ml/prediction/ --no-validation true

Expand Down
Loading