From 573e72e0e144a2d5dd553d59012e53e6def88db8 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 15 Nov 2023 18:46:46 -0800 Subject: [PATCH 1/8] Add debug info in SageMaker download graph. --- python/graphstorm/sagemaker/utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/graphstorm/sagemaker/utils.py b/python/graphstorm/sagemaker/utils.py index 94e59d49bc..e9d66ca9f9 100644 --- a/python/graphstorm/sagemaker/utils.py +++ b/python/graphstorm/sagemaker/utils.py @@ -270,16 +270,18 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses S3Downloader.download(os.path.join(graph_data_s3, graph_config), graph_path, sagemaker_session=sagemaker_session) try: + logging.info(f"Download graph from {os.path.join(graph_data_s3, graph_part)} to {graph_part_path}") S3Downloader.download(os.path.join(graph_data_s3, graph_part), graph_part_path, sagemaker_session=sagemaker_session) - except Exception: # pylint: disable=broad-except - print(f"Can not download graph_data from {graph_data_s3}.") - raise RuntimeError(f"Can not download graph_data from {graph_data_s3}.") + except Exception as e: # pylint: disable=broad-except + print(f"Can not download graph_data from {graph_data_s3}, {str(e)}.") + raise RuntimeError(f"Can not download graph_data from {graph_data_s3}, {str(e)}.") node_id_mapping = "node_mapping.pt" # Try to download node id mapping file if any try: + logging.info(f"Download graph id mapping from {os.path.join(graph_data_s3, node_id_mapping)} to {graph_path}") S3Downloader.download(os.path.join(graph_data_s3, node_id_mapping), graph_path, sagemaker_session=sagemaker_session) except Exception: # pylint: disable=broad-except @@ -290,6 +292,7 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses id_map_files = [file for file in files if file.endswith("id_remap.parquet")] for file in id_map_files: try: + logging.info(f"Download graph remap from {file} to {graph_path}") S3Downloader.download(file, graph_path, sagemaker_session=sagemaker_session) except Exception: # pylint: disable=broad-except From e533004286978d1afbfbbb0a093f3e9c5828a67d Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 15 Nov 2023 21:44:07 -0800 Subject: [PATCH 2/8] Update --- python/graphstorm/sagemaker/utils.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python/graphstorm/sagemaker/utils.py b/python/graphstorm/sagemaker/utils.py index e9d66ca9f9..b8ebb16041 100644 --- a/python/graphstorm/sagemaker/utils.py +++ b/python/graphstorm/sagemaker/utils.py @@ -274,7 +274,8 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses S3Downloader.download(os.path.join(graph_data_s3, graph_part), graph_part_path, sagemaker_session=sagemaker_session) except Exception as e: # pylint: disable=broad-except - print(f"Can not download graph_data from {graph_data_s3}, {str(e)}.") + logging.error("Can not download graph_data from %s, %s.", + graph_data_s3, str(e)) raise RuntimeError(f"Can not download graph_data from {graph_data_s3}, {str(e)}.") node_id_mapping = "node_mapping.pt" @@ -285,20 +286,21 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses S3Downloader.download(os.path.join(graph_data_s3, node_id_mapping), graph_path, sagemaker_session=sagemaker_session) except Exception: # pylint: disable=broad-except - print("node id mapping file does not exist") + logging.warning("node id mapping file does not exist") # Try to get GraphStorm ID to Original ID 
remaping files if any files = S3Downloader.list(graph_data_s3, sagemaker_session=sagemaker_session) id_map_files = [file for file in files if file.endswith("id_remap.parquet")] for file in id_map_files: try: - logging.info(f"Download graph remap from {file} to {graph_path}") + logging.info("Download graph remap from %s to %s", + file, graph_path) S3Downloader.download(file, graph_path, sagemaker_session=sagemaker_session) except Exception: # pylint: disable=broad-except - print(f"node id remap file {file} does not exist") + logging.warning("node id remap file %s does not exist", file) - print(f"Finish download graph data from {graph_data_s3}") + logging.info("Finish download graph data from %s", graph_data_s3) return os.path.join(graph_path, graph_config) From 0b1aef7c1670e2b5adc1156a96bf78bccc6f8371 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 15 Nov 2023 21:53:38 -0800 Subject: [PATCH 3/8] Update --- python/graphstorm/model/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/graphstorm/model/utils.py b/python/graphstorm/model/utils.py index 48b509544c..dceda85b2e 100644 --- a/python/graphstorm/model/utils.py +++ b/python/graphstorm/model/utils.py @@ -1012,7 +1012,8 @@ def _load_id_mapping(self, g, ntype, id_mappings): # Shuffled node ID: 0, 1, 2 id_mapping = id_mappings[ntype] if isinstance(id_mappings, dict) else id_mappings assert id_mapping.shape[0] == num_nodes, \ - "id mapping should have the same size of num_nodes" + "Id mapping should have the same size of num_nodes." \ + f"Expect {id_mapping.shape[0]}, but get {num_nodes}" # Save ID mapping into dist tensor id_mapping_info[th.arange(num_nodes)] = id_mapping barrier() From 5544e9248a1a4480662990b722c2f4dfcfa5b7f8 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 15 Nov 2023 21:58:56 -0800 Subject: [PATCH 4/8] Update --- python/graphstorm/sagemaker/utils.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/python/graphstorm/sagemaker/utils.py b/python/graphstorm/sagemaker/utils.py index b8ebb16041..1b2ce73248 100644 --- a/python/graphstorm/sagemaker/utils.py +++ b/python/graphstorm/sagemaker/utils.py @@ -183,8 +183,8 @@ def download_yaml_config(yaml_s3, local_path, sagemaker_session): try: S3Downloader.download(yaml_s3, local_path, sagemaker_session=sagemaker_session) - except Exception: # pylint: disable=broad-except - raise RuntimeError(f"Fail to download yaml file {yaml_s3}") + except Exception as err: # pylint: disable=broad-except + raise RuntimeError(f"Fail to download yaml file {yaml_s3}: {err}") return yaml_path @@ -206,9 +206,10 @@ def download_model(model_artifact_s3, model_path, sagemaker_session): try: S3Downloader.download(model_artifact_s3, model_path, sagemaker_session=sagemaker_session) - except Exception: # pylint: disable=broad-except + except Exception as err: # pylint: disable=broad-except raise RuntimeError("Can not download saved model artifact" \ - f"model.bin from {model_artifact_s3}.") + f"model.bin from {model_artifact_s3}." 
\ + f"{err}") def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_session): """ download graph data @@ -273,10 +274,10 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses logging.info(f"Download graph from {os.path.join(graph_data_s3, graph_part)} to {graph_part_path}") S3Downloader.download(os.path.join(graph_data_s3, graph_part), graph_part_path, sagemaker_session=sagemaker_session) - except Exception as e: # pylint: disable=broad-except + except Exception as err: # pylint: disable=broad-except logging.error("Can not download graph_data from %s, %s.", - graph_data_s3, str(e)) - raise RuntimeError(f"Can not download graph_data from {graph_data_s3}, {str(e)}.") + graph_data_s3, str(err)) + raise RuntimeError(f"Can not download graph_data from {graph_data_s3}, {err}.") node_id_mapping = "node_mapping.pt" @@ -319,9 +320,9 @@ def upload_data_to_s3(s3_path, data_path, sagemaker_session): try: ret = S3Uploader.upload(data_path, s3_path, sagemaker_session=sagemaker_session) - except Exception: # pylint: disable=broad-except - print(f"Can not upload data into {s3_path}") - raise RuntimeError(f"Can not upload data into {s3_path}") + except Exception as err: # pylint: disable=broad-except + logging.error("Can not upload data into %s", s3_path) + raise RuntimeError(f"Can not upload data into {s3_path}. {err}") return ret def upload_model_artifacts(model_s3_path, model_path, sagemaker_session): @@ -340,7 +341,7 @@ def upload_model_artifacts(model_s3_path, model_path, sagemaker_session): sagemaker_session: sagemaker.session.Session sagemaker_session to run download """ - print(f"Upload model artifacts to {model_s3_path}") + logging.info("Upload model artifacts to %s", model_s3_path) # Rank0 will upload both dense models and learnable embeddings owned by Rank0. # Other ranks will only upload learnable embeddings owned by themselves. 
return upload_data_to_s3(model_s3_path, model_path, sagemaker_session) From 69ad565934d6edc5d200a06f262b362ff1b21af8 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Mon, 27 Nov 2023 17:00:03 -0800 Subject: [PATCH 5/8] Using logging instead of print in SageMaker lib --- .../graphstorm/sagemaker/sagemaker_infer.py | 29 +++++++++++------ .../graphstorm/sagemaker/sagemaker_train.py | 32 +++++++++++-------- sagemaker/launch/launch_infer.py | 9 ++++-- sagemaker/launch/launch_train.py | 6 +++- sagemaker/run/infer_entry.py | 2 ++ sagemaker/run/train_entry.py | 2 ++ 6 files changed, 54 insertions(+), 26 deletions(-) diff --git a/python/graphstorm/sagemaker/sagemaker_infer.py b/python/graphstorm/sagemaker/sagemaker_infer.py index ca8bd4dc9b..8aad18b9d3 100644 --- a/python/graphstorm/sagemaker/sagemaker_infer.py +++ b/python/graphstorm/sagemaker/sagemaker_infer.py @@ -19,6 +19,7 @@ """ # Install additional requirements import os +import logging import socket import time import json @@ -117,13 +118,14 @@ def launch_infer_task(task_type, num_gpus, graph_config, launch_cmd += ["--cf", f"{yaml_path}", "--restore-model-path", f"{load_model_path}", "--save-embed-path", f"{save_emb_path}"] + extra_args + logging.debug("Launch inference %s", launch_cmd) def run(launch_cmd, state_q): try: subprocess.check_call(launch_cmd, shell=False) state_q.put(0) except subprocess.CalledProcessError as err: - print(f"Called process error {err}") + logging.error("Called process error %s", err) state_q.put(err.returncode) except Exception: # pylint: disable=broad-except state_q.put(-1) @@ -174,8 +176,8 @@ def run_infer(args, unknownargs): # start the ssh server subprocess.run(["service", "ssh", "start"], check=True) - print(f"Know args {args}") - print(f"Unknow args {unknownargs}") + logging.info("Know args %s", args) + logging.info("Unknow args %s", unknownargs) train_env = json.loads(args.sm_dist_env) hosts = train_env['hosts'] @@ -184,9 +186,15 @@ def run_infer(args, unknownargs): os.environ['WORLD_SIZE'] = str(world_size) host_rank = hosts.index(current_host) + # NOTE: Ensure no logging has been done before setting logging configuration + logging.basicConfig( + level=getattr(logging, args.log_level.upper(), None), + format=f'{current_host}: %(asctime)s - %(levelname)s - %(message)s', + force=True) + try: for host in hosts: - print(f"The {host} IP is {socket.gethostbyname(host)}") + logging.info("The %s IP is %s", host, {socket.gethostbyname(host)}) except: raise RuntimeError(f"Can not get host name of {hosts}") @@ -210,9 +218,9 @@ def run_infer(args, unknownargs): sock.connect((master_addr, 12345)) break except: # pylint: disable=bare-except - print(f"Try to connect {master_addr}") + logging.info("Try to connect %s", master_addr) time.sleep(10) - print("Connected") + logging.info("Connected") # write ip list info into disk ip_list_path = os.path.join(data_path, 'ip_list.txt') @@ -247,7 +255,8 @@ def run_infer(args, unknownargs): # Download Saved model download_model(model_artifact_s3, model_path, sagemaker_session) - print(f"{model_path} {os.listdir(model_path)}") + logging.info("Successfully download the model into %s.\n The model has: %s.", + model_path, os.listdir(model_path)) err_code = 0 if host_rank == 0: @@ -281,7 +290,7 @@ def run_infer(args, unknownargs): err_code = -1 terminate_workers(client_list, world_size, task_end) - print("Master End") + logging.info("Master End") if err_code != -1: upload_embs(output_emb_s3, emb_path, sagemaker_session) # clean embs, so SageMaker does not need to upload embs again @@ 
-295,12 +304,12 @@ def run_infer(args, unknownargs): upload_embs(output_emb_s3, emb_path, sagemaker_session) # clean embs, so SageMaker does not need to upload embs again remove_embs(emb_path) - print("Worker End") + logging.info("Worker End") sock.close() if err_code != 0: # Report an error - print("Task failed") + logging.info("Task failed") sys.exit(-1) if args.output_prediction_s3 is not None: diff --git a/python/graphstorm/sagemaker/sagemaker_train.py b/python/graphstorm/sagemaker/sagemaker_train.py index 6c6a1cf911..fc529b5060 100644 --- a/python/graphstorm/sagemaker/sagemaker_train.py +++ b/python/graphstorm/sagemaker/sagemaker_train.py @@ -17,6 +17,7 @@ """ # Install additional requirements import os +import logging import socket import time import json @@ -108,15 +109,14 @@ def launch_train_task(task_type, num_gpus, graph_config, launch_cmd += ["--restore-model-path", f"{restore_model_path}"] \ if restore_model_path is not None else [] launch_cmd += extra_args - - print(launch_cmd) + logging.debug("Launch training %s", launch_cmd) def run(launch_cmd, state_q): try: subprocess.check_call(launch_cmd, shell=False) state_q.put(0) except subprocess.CalledProcessError as err: - print(f"Called process error {err}") + logging.error("Called process error %s", err) state_q.put(err.returncode) except Exception: # pylint: disable=broad-except state_q.put(-1) @@ -167,8 +167,8 @@ def run_train(args, unknownargs): # start the ssh server subprocess.run(["service", "ssh", "start"], check=True) - print(f"Know args {args}") - print(f"Unknow args {unknownargs}") + logging.info("Know args %s", args) + logging.info("Unknow args %s", unknownargs) save_model_path = os.path.join(output_path, "model_checkpoint") @@ -179,9 +179,15 @@ def run_train(args, unknownargs): os.environ['WORLD_SIZE'] = str(world_size) host_rank = hosts.index(current_host) + # NOTE: Ensure no logging has been done before setting logging configuration + logging.basicConfig( + level=getattr(logging, args.log_level.upper(), None), + format=f'{current_host}: %(asctime)s - %(levelname)s - %(message)s', + force=True) + try: for host in hosts: - print(f"The {host} IP is {socket.gethostbyname(host)}") + logging.info("The %s IP is %s", host, socket.gethostbyname(host)) except: raise RuntimeError(f"Can not get host name of {hosts}") @@ -205,9 +211,9 @@ def run_train(args, unknownargs): sock.connect((master_addr, 12345)) break except: # pylint: disable=bare-except - print(f"Try to connect {master_addr}") + logging.info("Try to connect %s", master_addr) time.sleep(10) - print("Connected") + logging.info("Connected") # write ip list info into disk ip_list_path = os.path.join(data_path, 'ip_list.txt') @@ -232,8 +238,8 @@ def run_train(args, unknownargs): if model_checkpoint_s3 is not None: # Download Saved model checkpoint to resume download_model(model_checkpoint_s3, restore_model_path, sagemaker_session) - print(f"{restore_model_path} {os.listdir(restore_model_path)}") - + logging.info("Successfully download the model into %s.\n The model has: %s.", + restore_model_path, os.listdir(restore_model_path)) err_code = 0 if host_rank == 0: @@ -265,18 +271,18 @@ def run_train(args, unknownargs): print(e) err_code = -1 terminate_workers(client_list, world_size, task_end) - print("Master End") + logging.info("Master End") else: barrier(sock) # Block util training finished # Listen to end command wait_for_exit(sock) - print("Worker End") + logging.info("Worker End") sock.close() if err_code != 0: # Report an error - print("Task failed") + 
logging.error("Task failed") sys.exit(-1) # If there are saved models diff --git a/sagemaker/launch/launch_infer.py b/sagemaker/launch/launch_infer.py index a6186d8180..725d23da57 100644 --- a/sagemaker/launch/launch_infer.py +++ b/sagemaker/launch/launch_infer.py @@ -55,6 +55,7 @@ def run_job(input_args, image, unknownargs): output_predict_s3_path = input_args.output_prediction_s3 # S3 location to save prediction results model_artifact_s3 = input_args.model_artifact_s3 # S3 location of saved model artifacts output_chunk_size = input_args.output_chunk_size # Number of rows per chunked prediction result or node embedding file. + log_level = input_args.log_level # SageMaker runner logging level boto_session = boto3.session.Session(region_name=region) sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region) @@ -76,7 +77,8 @@ def run_job(input_args, image, unknownargs): "infer-yaml-s3": infer_yaml_s3, "output-emb-s3": output_emb_s3_path, "model-artifact-s3": model_artifact_s3, - "output-chunk-size": output_chunk_size} + "output-chunk-size": output_chunk_size, + "log-level": log_level} else: params = {"task-type": task_type, "graph-name": graph_name, @@ -85,7 +87,8 @@ def run_job(input_args, image, unknownargs): "output-emb-s3": output_emb_s3_path, "output-prediction-s3": output_predict_s3_path, "model-artifact-s3": model_artifact_s3, - "output-chunk-size": output_chunk_size} + "output-chunk-size": output_chunk_size, + "log-level": log_level} # We must handle cases like # --target-etype query,clicks,asin query,search,asin # --feat-name ntype0:feat0 ntype1:feat1 @@ -170,6 +173,8 @@ def get_inference_parser(): help="Relative path to the trained model under ." "There can be multiple model checkpoints under" ", this argument is used to choose one.") + inference_args.add_argument('--log-level', default='INFO', + type=str, choices=['DEBUG', 'INFO', 'WARNING', 'CRITICAL', 'FATAL']) return parser diff --git a/sagemaker/launch/launch_train.py b/sagemaker/launch/launch_train.py index 9d1109ab5c..430f04090b 100644 --- a/sagemaker/launch/launch_train.py +++ b/sagemaker/launch/launch_train.py @@ -52,6 +52,7 @@ def run_job(input_args, image, unknowargs): model_artifact_s3 = input_args.model_artifact_s3 # Where to store model artifacts model_checkpoint_to_load = input_args.model_checkpoint_to_load # S3 location of a saved model. custom_script = input_args.custom_script # custom_script if any + log_level = input_args.log_level # SageMaker runner logging level boto_session = boto3.session.Session(region_name=region) sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region) @@ -66,7 +67,8 @@ def run_job(input_args, image, unknowargs): "graph-name": graph_name, "graph-data-s3": graph_data_s3, "train-yaml-s3": train_yaml_s3, - "model-artifact-s3": model_artifact_s3} + "model-artifact-s3": model_artifact_s3, + "log-level": log_level} if custom_script is not None: params["custom-script"] = custom_script if model_checkpoint_to_load is not None: @@ -143,6 +145,8 @@ def get_train_parser(): training_args.add_argument("--custom-script", type=str, default=None, help="Custom training script provided by a customer to run customer training logic. 
\ Please provide the path of the script within the docker image") + training_args.add_argument('--log-level', default='INFO', + type=str, choices=['DEBUG', 'INFO', 'WARNING', 'CRITICAL', 'FATAL']) return parser diff --git a/sagemaker/run/infer_entry.py b/sagemaker/run/infer_entry.py index fa8cb181a8..98e0ac5352 100644 --- a/sagemaker/run/infer_entry.py +++ b/sagemaker/run/infer_entry.py @@ -52,6 +52,8 @@ def parse_train_args(): Please provide the path of the script within the docker image") parser.add_argument("--output-chunk-size", type=int, default=100000, help="Number of rows per chunked prediction result or node embedding file.") + parser.add_argument('--log-level', default='INFO', + type=str, choices=['DEBUG', 'INFO', 'WARNING', 'CRITICAL', 'FATAL']) # following arguments are required to launch a distributed GraphStorm training task parser.add_argument('--data-path', type=str, default=os.environ['SM_CHANNEL_TRAIN']) diff --git a/sagemaker/run/train_entry.py b/sagemaker/run/train_entry.py index a6af59f7af..ee1930fa5f 100644 --- a/sagemaker/run/train_entry.py +++ b/sagemaker/run/train_entry.py @@ -45,6 +45,8 @@ def parse_train_args(): parser.add_argument("--custom-script", type=str, default=None, help="Custom training script provided by a customer to run customer training logic. \ Please provide the path of the script within the docker image") + parser.add_argument('--log-level', default='INFO', + type=str, choices=['DEBUG', 'INFO', 'WARNING', 'CRITICAL', 'FATAL']) # following arguments are required to launch a distributed GraphStorm training task parser.add_argument('--data-path', type=str, default=os.environ['SM_CHANNEL_TRAIN']) From 1ace099edd8300e3681736757efb90976a24b618 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Mon, 27 Nov 2023 18:36:03 -0800 Subject: [PATCH 6/8] Update --- python/graphstorm/sagemaker/utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/graphstorm/sagemaker/utils.py b/python/graphstorm/sagemaker/utils.py index 1b2ce73248..8d9f88abde 100644 --- a/python/graphstorm/sagemaker/utils.py +++ b/python/graphstorm/sagemaker/utils.py @@ -271,7 +271,9 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses S3Downloader.download(os.path.join(graph_data_s3, graph_config), graph_path, sagemaker_session=sagemaker_session) try: - logging.info(f"Download graph from {os.path.join(graph_data_s3, graph_part)} to {graph_part_path}") + logging.info(f"Download graph from %s to %s", + os.path.join(graph_data_s3, graph_part), + graph_part_path) S3Downloader.download(os.path.join(graph_data_s3, graph_part), graph_part_path, sagemaker_session=sagemaker_session) except Exception as err: # pylint: disable=broad-except @@ -283,7 +285,9 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses # Try to download node id mapping file if any try: - logging.info(f"Download graph id mapping from {os.path.join(graph_data_s3, node_id_mapping)} to {graph_path}") + logging.info(f"Download graph id mapping from %s to %s", + os.path.join(graph_data_s3, node_id_mapping), + graph_path) S3Downloader.download(os.path.join(graph_data_s3, node_id_mapping), graph_path, sagemaker_session=sagemaker_session) except Exception: # pylint: disable=broad-except From e04685ccd9af992a52952c0f9abb2eb3da9ff251 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Mon, 27 Nov 2023 22:09:27 -0800 Subject: [PATCH 7/8] Update --- python/graphstorm/sagemaker/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/python/graphstorm/sagemaker/utils.py b/python/graphstorm/sagemaker/utils.py index 8d9f88abde..5bda551f58 100644 --- a/python/graphstorm/sagemaker/utils.py +++ b/python/graphstorm/sagemaker/utils.py @@ -271,7 +271,7 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses S3Downloader.download(os.path.join(graph_data_s3, graph_config), graph_path, sagemaker_session=sagemaker_session) try: - logging.info(f"Download graph from %s to %s", + logging.info("Download graph from %s to %s", os.path.join(graph_data_s3, graph_part), graph_part_path) S3Downloader.download(os.path.join(graph_data_s3, graph_part), @@ -285,7 +285,7 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses # Try to download node id mapping file if any try: - logging.info(f"Download graph id mapping from %s to %s", + logging.info("Download graph id mapping from %s to %s", os.path.join(graph_data_s3, node_id_mapping), graph_path) S3Downloader.download(os.path.join(graph_data_s3, node_id_mapping), From 8f560a219c8113d930d6878e5a3afea36c35a7de Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Tue, 28 Nov 2023 14:12:03 -0800 Subject: [PATCH 8/8] Update --- python/graphstorm/sagemaker/sagemaker_infer.py | 8 ++++---- python/graphstorm/sagemaker/sagemaker_train.py | 6 +++--- python/graphstorm/sagemaker/utils.py | 9 ++++++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/python/graphstorm/sagemaker/sagemaker_infer.py b/python/graphstorm/sagemaker/sagemaker_infer.py index 8aad18b9d3..89ca5921a3 100644 --- a/python/graphstorm/sagemaker/sagemaker_infer.py +++ b/python/graphstorm/sagemaker/sagemaker_infer.py @@ -176,8 +176,8 @@ def run_infer(args, unknownargs): # start the ssh server subprocess.run(["service", "ssh", "start"], check=True) - logging.info("Know args %s", args) - logging.info("Unknow args %s", unknownargs) + logging.info("Known args %s", args) + logging.info("Unknown args %s", unknownargs) train_env = json.loads(args.sm_dist_env) hosts = train_env['hosts'] @@ -255,7 +255,7 @@ def run_infer(args, unknownargs): # Download Saved model download_model(model_artifact_s3, model_path, sagemaker_session) - logging.info("Successfully download the model into %s.\n The model has: %s.", + logging.info("Successfully downloaded the model into %s.\n The model files are: %s.", model_path, os.listdir(model_path)) err_code = 0 @@ -309,7 +309,7 @@ def run_infer(args, unknownargs): sock.close() if err_code != 0: # Report an error - logging.info("Task failed") + logging.error("Task failed") sys.exit(-1) if args.output_prediction_s3 is not None: diff --git a/python/graphstorm/sagemaker/sagemaker_train.py b/python/graphstorm/sagemaker/sagemaker_train.py index fc529b5060..cf36a79aa5 100644 --- a/python/graphstorm/sagemaker/sagemaker_train.py +++ b/python/graphstorm/sagemaker/sagemaker_train.py @@ -167,8 +167,8 @@ def run_train(args, unknownargs): # start the ssh server subprocess.run(["service", "ssh", "start"], check=True) - logging.info("Know args %s", args) - logging.info("Unknow args %s", unknownargs) + logging.info("Known args %s", args) + logging.info("Unknown args %s", unknownargs) save_model_path = os.path.join(output_path, "model_checkpoint") @@ -238,7 +238,7 @@ def run_train(args, unknownargs): if model_checkpoint_s3 is not None: # Download Saved model checkpoint to resume download_model(model_checkpoint_s3, restore_model_path, sagemaker_session) - logging.info("Successfully download the model into %s.\n The model has: %s.", + logging.info("Successfully downloaded 
the model into %s.\n The model files are: %s.", restore_model_path, os.listdir(restore_model_path)) err_code = 0 diff --git a/python/graphstorm/sagemaker/utils.py b/python/graphstorm/sagemaker/utils.py index 5bda551f58..f80f7347e8 100644 --- a/python/graphstorm/sagemaker/utils.py +++ b/python/graphstorm/sagemaker/utils.py @@ -291,7 +291,10 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses S3Downloader.download(os.path.join(graph_data_s3, node_id_mapping), graph_path, sagemaker_session=sagemaker_session) except Exception: # pylint: disable=broad-except - logging.warning("node id mapping file does not exist") + logging.warning("Node id mapping file does not exist." + "If you are running GraphStorm on a graph with " + "more than 1 partition, it is recommended to provide " + "the node id mapping file created by gconstruct or gsprocessing.") # Try to get GraphStorm ID to Original ID remaping files if any files = S3Downloader.list(graph_data_s3, sagemaker_session=sagemaker_session) @@ -305,7 +308,7 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses except Exception: # pylint: disable=broad-except logging.warning("node id remap file %s does not exist", file) - logging.info("Finish download graph data from %s", graph_data_s3) + logging.info("Finished downloading graph data from %s", graph_data_s3) return os.path.join(graph_path, graph_config) @@ -345,7 +348,7 @@ def upload_model_artifacts(model_s3_path, model_path, sagemaker_session): sagemaker_session: sagemaker.session.Session sagemaker_session to run download """ - logging.info("Upload model artifacts to %s", model_s3_path) + logging.info("Uploading model artifacts to %s", model_s3_path) # Rank0 will upload both dense models and learnable embeddings owned by Rank0. # Other ranks will only upload learnable embeddings owned by themselves. return upload_data_to_s3(model_s3_path, model_path, sagemaker_session)
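
---

Taken together, these patches replace ad-hoc `print` calls in the GraphStorm SageMaker runners with the standard `logging` module, configure per-host log formatting via `logging.basicConfig(..., force=True)`, and thread a new `--log-level` argument from the launch scripts into the entry points. The sketch below is a minimal, standalone illustration of that logging convention; it is not code from the repository, and the host name, S3 prefix, and `download_graph_data` helper are placeholders assumed for the example.

```python
import logging
import os

# Placeholder values standing in for what the SageMaker entry scripts read
# from their environment and from the new --log-level launcher argument.
current_host = os.environ.get("SM_CURRENT_HOST", "algo-1")
log_level = "INFO"
graph_data_s3 = "s3://example-bucket/graph/"  # placeholder S3 prefix

# Configure logging once, before any other logging call; force=True clears
# handlers an earlier import may have installed, mirroring the
# basicConfig(..., force=True) calls added to sagemaker_train.py and
# sagemaker_infer.py.
logging.basicConfig(
    level=getattr(logging, log_level.upper(), logging.INFO),
    format=f"{current_host}: %(asctime)s - %(levelname)s - %(message)s",
    force=True,
)


def download_graph_data(s3_prefix):
    """Simulate the log-then-rethrow pattern used in download_graph()."""
    try:
        # Stand-in for a failing S3Downloader.download call.
        raise OSError("connection reset by peer")
    except Exception as err:  # pylint: disable=broad-except
        # Log with lazy %-style arguments so the message is only formatted
        # when the level is enabled, then re-raise with the same detail.
        logging.error("Can not download graph_data from %s, %s.", s3_prefix, err)
        raise RuntimeError(
            f"Can not download graph_data from {s3_prefix}, {err}.") from err


if __name__ == "__main__":
    logging.info("Download graph from %s to %s", graph_data_s3, "/tmp/graph")
    try:
        download_graph_data(graph_data_s3)
    except RuntimeError as err:
        logging.error("Task failed: %s", err)
```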