From 573e72e0e144a2d5dd553d59012e53e6def88db8 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 15 Nov 2023 18:46:46 -0800 Subject: [PATCH 1/8] Add debug info in SageMaker download graph. --- python/graphstorm/sagemaker/utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/graphstorm/sagemaker/utils.py b/python/graphstorm/sagemaker/utils.py index 94e59d49bc..e9d66ca9f9 100644 --- a/python/graphstorm/sagemaker/utils.py +++ b/python/graphstorm/sagemaker/utils.py @@ -270,16 +270,18 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses S3Downloader.download(os.path.join(graph_data_s3, graph_config), graph_path, sagemaker_session=sagemaker_session) try: + logging.info(f"Download graph from {os.path.join(graph_data_s3, graph_part)} to {graph_part_path}") S3Downloader.download(os.path.join(graph_data_s3, graph_part), graph_part_path, sagemaker_session=sagemaker_session) - except Exception: # pylint: disable=broad-except - print(f"Can not download graph_data from {graph_data_s3}.") - raise RuntimeError(f"Can not download graph_data from {graph_data_s3}.") + except Exception as e: # pylint: disable=broad-except + print(f"Can not download graph_data from {graph_data_s3}, {str(e)}.") + raise RuntimeError(f"Can not download graph_data from {graph_data_s3}, {str(e)}.") node_id_mapping = "node_mapping.pt" # Try to download node id mapping file if any try: + logging.info(f"Download graph id mapping from {os.path.join(graph_data_s3, node_id_mapping)} to {graph_path}") S3Downloader.download(os.path.join(graph_data_s3, node_id_mapping), graph_path, sagemaker_session=sagemaker_session) except Exception: # pylint: disable=broad-except @@ -290,6 +292,7 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses id_map_files = [file for file in files if file.endswith("id_remap.parquet")] for file in id_map_files: try: + logging.info(f"Download graph remap from {file} to {graph_path}") S3Downloader.download(file, graph_path, sagemaker_session=sagemaker_session) except Exception: # pylint: disable=broad-except From e533004286978d1afbfbbb0a093f3e9c5828a67d Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 15 Nov 2023 21:44:07 -0800 Subject: [PATCH 2/8] Update --- python/graphstorm/sagemaker/utils.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python/graphstorm/sagemaker/utils.py b/python/graphstorm/sagemaker/utils.py index e9d66ca9f9..b8ebb16041 100644 --- a/python/graphstorm/sagemaker/utils.py +++ b/python/graphstorm/sagemaker/utils.py @@ -274,7 +274,8 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses S3Downloader.download(os.path.join(graph_data_s3, graph_part), graph_part_path, sagemaker_session=sagemaker_session) except Exception as e: # pylint: disable=broad-except - print(f"Can not download graph_data from {graph_data_s3}, {str(e)}.") + logging.error("Can not download graph_data from %s, %s.", + graph_data_s3, str(e)) raise RuntimeError(f"Can not download graph_data from {graph_data_s3}, {str(e)}.") node_id_mapping = "node_mapping.pt" @@ -285,20 +286,21 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses S3Downloader.download(os.path.join(graph_data_s3, node_id_mapping), graph_path, sagemaker_session=sagemaker_session) except Exception: # pylint: disable=broad-except - print("node id mapping file does not exist") + logging.warning("node id mapping file does not exist") # Try to get GraphStorm ID to Original ID 
remaping files if any files = S3Downloader.list(graph_data_s3, sagemaker_session=sagemaker_session) id_map_files = [file for file in files if file.endswith("id_remap.parquet")] for file in id_map_files: try: - logging.info(f"Download graph remap from {file} to {graph_path}") + logging.info("Download graph remap from %s to %s", + file, graph_path) S3Downloader.download(file, graph_path, sagemaker_session=sagemaker_session) except Exception: # pylint: disable=broad-except - print(f"node id remap file {file} does not exist") + logging.warning("node id remap file %s does not exist", file) - print(f"Finish download graph data from {graph_data_s3}") + logging.info("Finish download graph data from %s", graph_data_s3) return os.path.join(graph_path, graph_config) From 0b1aef7c1670e2b5adc1156a96bf78bccc6f8371 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 15 Nov 2023 21:53:38 -0800 Subject: [PATCH 3/8] Update --- python/graphstorm/model/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/graphstorm/model/utils.py b/python/graphstorm/model/utils.py index 48b509544c..dceda85b2e 100644 --- a/python/graphstorm/model/utils.py +++ b/python/graphstorm/model/utils.py @@ -1012,7 +1012,8 @@ def _load_id_mapping(self, g, ntype, id_mappings): # Shuffled node ID: 0, 1, 2 id_mapping = id_mappings[ntype] if isinstance(id_mappings, dict) else id_mappings assert id_mapping.shape[0] == num_nodes, \ - "id mapping should have the same size of num_nodes" + "Id mapping should have the same size of num_nodes." \ + f"Expect {id_mapping.shape[0]}, but get {num_nodes}" # Save ID mapping into dist tensor id_mapping_info[th.arange(num_nodes)] = id_mapping barrier() From 5544e9248a1a4480662990b722c2f4dfcfa5b7f8 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 15 Nov 2023 21:58:56 -0800 Subject: [PATCH 4/8] Update --- python/graphstorm/sagemaker/utils.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/python/graphstorm/sagemaker/utils.py b/python/graphstorm/sagemaker/utils.py index b8ebb16041..1b2ce73248 100644 --- a/python/graphstorm/sagemaker/utils.py +++ b/python/graphstorm/sagemaker/utils.py @@ -183,8 +183,8 @@ def download_yaml_config(yaml_s3, local_path, sagemaker_session): try: S3Downloader.download(yaml_s3, local_path, sagemaker_session=sagemaker_session) - except Exception: # pylint: disable=broad-except - raise RuntimeError(f"Fail to download yaml file {yaml_s3}") + except Exception as err: # pylint: disable=broad-except + raise RuntimeError(f"Fail to download yaml file {yaml_s3}: {err}") return yaml_path @@ -206,9 +206,10 @@ def download_model(model_artifact_s3, model_path, sagemaker_session): try: S3Downloader.download(model_artifact_s3, model_path, sagemaker_session=sagemaker_session) - except Exception: # pylint: disable=broad-except + except Exception as err: # pylint: disable=broad-except raise RuntimeError("Can not download saved model artifact" \ - f"model.bin from {model_artifact_s3}.") + f"model.bin from {model_artifact_s3}." 
\ + f"{err}") def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_session): """ download graph data @@ -273,10 +274,10 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses logging.info(f"Download graph from {os.path.join(graph_data_s3, graph_part)} to {graph_part_path}") S3Downloader.download(os.path.join(graph_data_s3, graph_part), graph_part_path, sagemaker_session=sagemaker_session) - except Exception as e: # pylint: disable=broad-except + except Exception as err: # pylint: disable=broad-except logging.error("Can not download graph_data from %s, %s.", - graph_data_s3, str(e)) - raise RuntimeError(f"Can not download graph_data from {graph_data_s3}, {str(e)}.") + graph_data_s3, str(err)) + raise RuntimeError(f"Can not download graph_data from {graph_data_s3}, {err}.") node_id_mapping = "node_mapping.pt" @@ -319,9 +320,9 @@ def upload_data_to_s3(s3_path, data_path, sagemaker_session): try: ret = S3Uploader.upload(data_path, s3_path, sagemaker_session=sagemaker_session) - except Exception: # pylint: disable=broad-except - print(f"Can not upload data into {s3_path}") - raise RuntimeError(f"Can not upload data into {s3_path}") + except Exception as err: # pylint: disable=broad-except + logging.error("Can not upload data into %s", s3_path) + raise RuntimeError(f"Can not upload data into {s3_path}. {err}") return ret def upload_model_artifacts(model_s3_path, model_path, sagemaker_session): @@ -340,7 +341,7 @@ def upload_model_artifacts(model_s3_path, model_path, sagemaker_session): sagemaker_session: sagemaker.session.Session sagemaker_session to run download """ - print(f"Upload model artifacts to {model_s3_path}") + logging.info("Upload model artifacts to %s", model_s3_path) # Rank0 will upload both dense models and learnable embeddings owned by Rank0. # Other ranks will only upload learnable embeddings owned by themselves. 
return upload_data_to_s3(model_s3_path, model_path, sagemaker_session) From 69ad565934d6edc5d200a06f262b362ff1b21af8 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Mon, 27 Nov 2023 17:00:03 -0800 Subject: [PATCH 5/8] Using logging instead of print in SageMaker lib --- .../graphstorm/sagemaker/sagemaker_infer.py | 29 +++++++++++------ .../graphstorm/sagemaker/sagemaker_train.py | 32 +++++++++++-------- sagemaker/launch/launch_infer.py | 9 ++++-- sagemaker/launch/launch_train.py | 6 +++- sagemaker/run/infer_entry.py | 2 ++ sagemaker/run/train_entry.py | 2 ++ 6 files changed, 54 insertions(+), 26 deletions(-) diff --git a/python/graphstorm/sagemaker/sagemaker_infer.py b/python/graphstorm/sagemaker/sagemaker_infer.py index ca8bd4dc9b..8aad18b9d3 100644 --- a/python/graphstorm/sagemaker/sagemaker_infer.py +++ b/python/graphstorm/sagemaker/sagemaker_infer.py @@ -19,6 +19,7 @@ """ # Install additional requirements import os +import logging import socket import time import json @@ -117,13 +118,14 @@ def launch_infer_task(task_type, num_gpus, graph_config, launch_cmd += ["--cf", f"{yaml_path}", "--restore-model-path", f"{load_model_path}", "--save-embed-path", f"{save_emb_path}"] + extra_args + logging.debug("Launch inference %s", launch_cmd) def run(launch_cmd, state_q): try: subprocess.check_call(launch_cmd, shell=False) state_q.put(0) except subprocess.CalledProcessError as err: - print(f"Called process error {err}") + logging.error("Called process error %s", err) state_q.put(err.returncode) except Exception: # pylint: disable=broad-except state_q.put(-1) @@ -174,8 +176,8 @@ def run_infer(args, unknownargs): # start the ssh server subprocess.run(["service", "ssh", "start"], check=True) - print(f"Know args {args}") - print(f"Unknow args {unknownargs}") + logging.info("Know args %s", args) + logging.info("Unknow args %s", unknownargs) train_env = json.loads(args.sm_dist_env) hosts = train_env['hosts'] @@ -184,9 +186,15 @@ def run_infer(args, unknownargs): os.environ['WORLD_SIZE'] = str(world_size) host_rank = hosts.index(current_host) + # NOTE: Ensure no logging has been done before setting logging configuration + logging.basicConfig( + level=getattr(logging, args.log_level.upper(), None), + format=f'{current_host}: %(asctime)s - %(levelname)s - %(message)s', + force=True) + try: for host in hosts: - print(f"The {host} IP is {socket.gethostbyname(host)}") + logging.info("The %s IP is %s", host, {socket.gethostbyname(host)}) except: raise RuntimeError(f"Can not get host name of {hosts}") @@ -210,9 +218,9 @@ def run_infer(args, unknownargs): sock.connect((master_addr, 12345)) break except: # pylint: disable=bare-except - print(f"Try to connect {master_addr}") + logging.info("Try to connect %s", master_addr) time.sleep(10) - print("Connected") + logging.info("Connected") # write ip list info into disk ip_list_path = os.path.join(data_path, 'ip_list.txt') @@ -247,7 +255,8 @@ def run_infer(args, unknownargs): # Download Saved model download_model(model_artifact_s3, model_path, sagemaker_session) - print(f"{model_path} {os.listdir(model_path)}") + logging.info("Successfully download the model into %s.\n The model has: %s.", + model_path, os.listdir(model_path)) err_code = 0 if host_rank == 0: @@ -281,7 +290,7 @@ def run_infer(args, unknownargs): err_code = -1 terminate_workers(client_list, world_size, task_end) - print("Master End") + logging.info("Master End") if err_code != -1: upload_embs(output_emb_s3, emb_path, sagemaker_session) # clean embs, so SageMaker does not need to upload embs again @@ 
-295,12 +304,12 @@ def run_infer(args, unknownargs): upload_embs(output_emb_s3, emb_path, sagemaker_session) # clean embs, so SageMaker does not need to upload embs again remove_embs(emb_path) - print("Worker End") + logging.info("Worker End") sock.close() if err_code != 0: # Report an error - print("Task failed") + logging.info("Task failed") sys.exit(-1) if args.output_prediction_s3 is not None: diff --git a/python/graphstorm/sagemaker/sagemaker_train.py b/python/graphstorm/sagemaker/sagemaker_train.py index 6c6a1cf911..fc529b5060 100644 --- a/python/graphstorm/sagemaker/sagemaker_train.py +++ b/python/graphstorm/sagemaker/sagemaker_train.py @@ -17,6 +17,7 @@ """ # Install additional requirements import os +import logging import socket import time import json @@ -108,15 +109,14 @@ def launch_train_task(task_type, num_gpus, graph_config, launch_cmd += ["--restore-model-path", f"{restore_model_path}"] \ if restore_model_path is not None else [] launch_cmd += extra_args - - print(launch_cmd) + logging.debug("Launch training %s", launch_cmd) def run(launch_cmd, state_q): try: subprocess.check_call(launch_cmd, shell=False) state_q.put(0) except subprocess.CalledProcessError as err: - print(f"Called process error {err}") + logging.error("Called process error %s", err) state_q.put(err.returncode) except Exception: # pylint: disable=broad-except state_q.put(-1) @@ -167,8 +167,8 @@ def run_train(args, unknownargs): # start the ssh server subprocess.run(["service", "ssh", "start"], check=True) - print(f"Know args {args}") - print(f"Unknow args {unknownargs}") + logging.info("Know args %s", args) + logging.info("Unknow args %s", unknownargs) save_model_path = os.path.join(output_path, "model_checkpoint") @@ -179,9 +179,15 @@ def run_train(args, unknownargs): os.environ['WORLD_SIZE'] = str(world_size) host_rank = hosts.index(current_host) + # NOTE: Ensure no logging has been done before setting logging configuration + logging.basicConfig( + level=getattr(logging, args.log_level.upper(), None), + format=f'{current_host}: %(asctime)s - %(levelname)s - %(message)s', + force=True) + try: for host in hosts: - print(f"The {host} IP is {socket.gethostbyname(host)}") + logging.info("The %s IP is %s", host, socket.gethostbyname(host)) except: raise RuntimeError(f"Can not get host name of {hosts}") @@ -205,9 +211,9 @@ def run_train(args, unknownargs): sock.connect((master_addr, 12345)) break except: # pylint: disable=bare-except - print(f"Try to connect {master_addr}") + logging.info("Try to connect %s", master_addr) time.sleep(10) - print("Connected") + logging.info("Connected") # write ip list info into disk ip_list_path = os.path.join(data_path, 'ip_list.txt') @@ -232,8 +238,8 @@ def run_train(args, unknownargs): if model_checkpoint_s3 is not None: # Download Saved model checkpoint to resume download_model(model_checkpoint_s3, restore_model_path, sagemaker_session) - print(f"{restore_model_path} {os.listdir(restore_model_path)}") - + logging.info("Successfully download the model into %s.\n The model has: %s.", + restore_model_path, os.listdir(restore_model_path)) err_code = 0 if host_rank == 0: @@ -265,18 +271,18 @@ def run_train(args, unknownargs): print(e) err_code = -1 terminate_workers(client_list, world_size, task_end) - print("Master End") + logging.info("Master End") else: barrier(sock) # Block util training finished # Listen to end command wait_for_exit(sock) - print("Worker End") + logging.info("Worker End") sock.close() if err_code != 0: # Report an error - print("Task failed") + 
logging.error("Task failed") sys.exit(-1) # If there are saved models diff --git a/sagemaker/launch/launch_infer.py b/sagemaker/launch/launch_infer.py index a6186d8180..725d23da57 100644 --- a/sagemaker/launch/launch_infer.py +++ b/sagemaker/launch/launch_infer.py @@ -55,6 +55,7 @@ def run_job(input_args, image, unknownargs): output_predict_s3_path = input_args.output_prediction_s3 # S3 location to save prediction results model_artifact_s3 = input_args.model_artifact_s3 # S3 location of saved model artifacts output_chunk_size = input_args.output_chunk_size # Number of rows per chunked prediction result or node embedding file. + log_level = input_args.log_level # SageMaker runner logging level boto_session = boto3.session.Session(region_name=region) sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region) @@ -76,7 +77,8 @@ def run_job(input_args, image, unknownargs): "infer-yaml-s3": infer_yaml_s3, "output-emb-s3": output_emb_s3_path, "model-artifact-s3": model_artifact_s3, - "output-chunk-size": output_chunk_size} + "output-chunk-size": output_chunk_size, + "log-level": log_level} else: params = {"task-type": task_type, "graph-name": graph_name, @@ -85,7 +87,8 @@ def run_job(input_args, image, unknownargs): "output-emb-s3": output_emb_s3_path, "output-prediction-s3": output_predict_s3_path, "model-artifact-s3": model_artifact_s3, - "output-chunk-size": output_chunk_size} + "output-chunk-size": output_chunk_size, + "log-level": log_level} # We must handle cases like # --target-etype query,clicks,asin query,search,asin # --feat-name ntype0:feat0 ntype1:feat1 @@ -170,6 +173,8 @@ def get_inference_parser(): help="Relative path to the trained model under ." "There can be multiple model checkpoints under" ", this argument is used to choose one.") + inference_args.add_argument('--log-level', default='INFO', + type=str, choices=['DEBUG', 'INFO', 'WARNING', 'CRITICAL', 'FATAL']) return parser diff --git a/sagemaker/launch/launch_train.py b/sagemaker/launch/launch_train.py index 9d1109ab5c..430f04090b 100644 --- a/sagemaker/launch/launch_train.py +++ b/sagemaker/launch/launch_train.py @@ -52,6 +52,7 @@ def run_job(input_args, image, unknowargs): model_artifact_s3 = input_args.model_artifact_s3 # Where to store model artifacts model_checkpoint_to_load = input_args.model_checkpoint_to_load # S3 location of a saved model. custom_script = input_args.custom_script # custom_script if any + log_level = input_args.log_level # SageMaker runner logging level boto_session = boto3.session.Session(region_name=region) sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region) @@ -66,7 +67,8 @@ def run_job(input_args, image, unknowargs): "graph-name": graph_name, "graph-data-s3": graph_data_s3, "train-yaml-s3": train_yaml_s3, - "model-artifact-s3": model_artifact_s3} + "model-artifact-s3": model_artifact_s3, + "log-level": log_level} if custom_script is not None: params["custom-script"] = custom_script if model_checkpoint_to_load is not None: @@ -143,6 +145,8 @@ def get_train_parser(): training_args.add_argument("--custom-script", type=str, default=None, help="Custom training script provided by a customer to run customer training logic. 
\ Please provide the path of the script within the docker image") + training_args.add_argument('--log-level', default='INFO', + type=str, choices=['DEBUG', 'INFO', 'WARNING', 'CRITICAL', 'FATAL']) return parser diff --git a/sagemaker/run/infer_entry.py b/sagemaker/run/infer_entry.py index fa8cb181a8..98e0ac5352 100644 --- a/sagemaker/run/infer_entry.py +++ b/sagemaker/run/infer_entry.py @@ -52,6 +52,8 @@ def parse_train_args(): Please provide the path of the script within the docker image") parser.add_argument("--output-chunk-size", type=int, default=100000, help="Number of rows per chunked prediction result or node embedding file.") + parser.add_argument('--log-level', default='INFO', + type=str, choices=['DEBUG', 'INFO', 'WARNING', 'CRITICAL', 'FATAL']) # following arguments are required to launch a distributed GraphStorm training task parser.add_argument('--data-path', type=str, default=os.environ['SM_CHANNEL_TRAIN']) diff --git a/sagemaker/run/train_entry.py b/sagemaker/run/train_entry.py index a6af59f7af..ee1930fa5f 100644 --- a/sagemaker/run/train_entry.py +++ b/sagemaker/run/train_entry.py @@ -45,6 +45,8 @@ def parse_train_args(): parser.add_argument("--custom-script", type=str, default=None, help="Custom training script provided by a customer to run customer training logic. \ Please provide the path of the script within the docker image") + parser.add_argument('--log-level', default='INFO', + type=str, choices=['DEBUG', 'INFO', 'WARNING', 'CRITICAL', 'FATAL']) # following arguments are required to launch a distributed GraphStorm training task parser.add_argument('--data-path', type=str, default=os.environ['SM_CHANNEL_TRAIN']) From 1ace099edd8300e3681736757efb90976a24b618 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Mon, 27 Nov 2023 18:36:03 -0800 Subject: [PATCH 6/8] Update --- python/graphstorm/sagemaker/utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/graphstorm/sagemaker/utils.py b/python/graphstorm/sagemaker/utils.py index 1b2ce73248..8d9f88abde 100644 --- a/python/graphstorm/sagemaker/utils.py +++ b/python/graphstorm/sagemaker/utils.py @@ -271,7 +271,9 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses S3Downloader.download(os.path.join(graph_data_s3, graph_config), graph_path, sagemaker_session=sagemaker_session) try: - logging.info(f"Download graph from {os.path.join(graph_data_s3, graph_part)} to {graph_part_path}") + logging.info(f"Download graph from %s to %s", + os.path.join(graph_data_s3, graph_part), + graph_part_path) S3Downloader.download(os.path.join(graph_data_s3, graph_part), graph_part_path, sagemaker_session=sagemaker_session) except Exception as err: # pylint: disable=broad-except @@ -283,7 +285,9 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses # Try to download node id mapping file if any try: - logging.info(f"Download graph id mapping from {os.path.join(graph_data_s3, node_id_mapping)} to {graph_path}") + logging.info(f"Download graph id mapping from %s to %s", + os.path.join(graph_data_s3, node_id_mapping), + graph_path) S3Downloader.download(os.path.join(graph_data_s3, node_id_mapping), graph_path, sagemaker_session=sagemaker_session) except Exception: # pylint: disable=broad-except From e04685ccd9af992a52952c0f9abb2eb3da9ff251 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Mon, 27 Nov 2023 22:09:27 -0800 Subject: [PATCH 7/8] Update --- python/graphstorm/sagemaker/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/python/graphstorm/sagemaker/utils.py b/python/graphstorm/sagemaker/utils.py index 8d9f88abde..5bda551f58 100644 --- a/python/graphstorm/sagemaker/utils.py +++ b/python/graphstorm/sagemaker/utils.py @@ -271,7 +271,7 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses S3Downloader.download(os.path.join(graph_data_s3, graph_config), graph_path, sagemaker_session=sagemaker_session) try: - logging.info(f"Download graph from %s to %s", + logging.info("Download graph from %s to %s", os.path.join(graph_data_s3, graph_part), graph_part_path) S3Downloader.download(os.path.join(graph_data_s3, graph_part), @@ -285,7 +285,7 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses # Try to download node id mapping file if any try: - logging.info(f"Download graph id mapping from %s to %s", + logging.info("Download graph id mapping from %s to %s", os.path.join(graph_data_s3, node_id_mapping), graph_path) S3Downloader.download(os.path.join(graph_data_s3, node_id_mapping), From 8f560a219c8113d930d6878e5a3afea36c35a7de Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Tue, 28 Nov 2023 14:12:03 -0800 Subject: [PATCH 8/8] Update --- python/graphstorm/sagemaker/sagemaker_infer.py | 8 ++++---- python/graphstorm/sagemaker/sagemaker_train.py | 6 +++--- python/graphstorm/sagemaker/utils.py | 9 ++++++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/python/graphstorm/sagemaker/sagemaker_infer.py b/python/graphstorm/sagemaker/sagemaker_infer.py index 8aad18b9d3..89ca5921a3 100644 --- a/python/graphstorm/sagemaker/sagemaker_infer.py +++ b/python/graphstorm/sagemaker/sagemaker_infer.py @@ -176,8 +176,8 @@ def run_infer(args, unknownargs): # start the ssh server subprocess.run(["service", "ssh", "start"], check=True) - logging.info("Know args %s", args) - logging.info("Unknow args %s", unknownargs) + logging.info("Known args %s", args) + logging.info("Unknown args %s", unknownargs) train_env = json.loads(args.sm_dist_env) hosts = train_env['hosts'] @@ -255,7 +255,7 @@ def run_infer(args, unknownargs): # Download Saved model download_model(model_artifact_s3, model_path, sagemaker_session) - logging.info("Successfully download the model into %s.\n The model has: %s.", + logging.info("Successfully downloaded the model into %s.\n The model files are: %s.", model_path, os.listdir(model_path)) err_code = 0 @@ -309,7 +309,7 @@ def run_infer(args, unknownargs): sock.close() if err_code != 0: # Report an error - logging.info("Task failed") + logging.error("Task failed") sys.exit(-1) if args.output_prediction_s3 is not None: diff --git a/python/graphstorm/sagemaker/sagemaker_train.py b/python/graphstorm/sagemaker/sagemaker_train.py index fc529b5060..cf36a79aa5 100644 --- a/python/graphstorm/sagemaker/sagemaker_train.py +++ b/python/graphstorm/sagemaker/sagemaker_train.py @@ -167,8 +167,8 @@ def run_train(args, unknownargs): # start the ssh server subprocess.run(["service", "ssh", "start"], check=True) - logging.info("Know args %s", args) - logging.info("Unknow args %s", unknownargs) + logging.info("Known args %s", args) + logging.info("Unknown args %s", unknownargs) save_model_path = os.path.join(output_path, "model_checkpoint") @@ -238,7 +238,7 @@ def run_train(args, unknownargs): if model_checkpoint_s3 is not None: # Download Saved model checkpoint to resume download_model(model_checkpoint_s3, restore_model_path, sagemaker_session) - logging.info("Successfully download the model into %s.\n The model has: %s.", + logging.info("Successfully downloaded 
the model into %s.\n The model files are: %s.", restore_model_path, os.listdir(restore_model_path)) err_code = 0 diff --git a/python/graphstorm/sagemaker/utils.py b/python/graphstorm/sagemaker/utils.py index 5bda551f58..f80f7347e8 100644 --- a/python/graphstorm/sagemaker/utils.py +++ b/python/graphstorm/sagemaker/utils.py @@ -291,7 +291,10 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses S3Downloader.download(os.path.join(graph_data_s3, node_id_mapping), graph_path, sagemaker_session=sagemaker_session) except Exception: # pylint: disable=broad-except - logging.warning("node id mapping file does not exist") + logging.warning("Node id mapping file does not exist." + "If you are running GraphStorm on a graph with " + "more than 1 partition, it is recommended to provide " + "the node id mapping file created by gconstruct or gsprocessing.") # Try to get GraphStorm ID to Original ID remaping files if any files = S3Downloader.list(graph_data_s3, sagemaker_session=sagemaker_session) @@ -305,7 +308,7 @@ def download_graph(graph_data_s3, graph_name, part_id, local_path, sagemaker_ses except Exception: # pylint: disable=broad-except logging.warning("node id remap file %s does not exist", file) - logging.info("Finish download graph data from %s", graph_data_s3) + logging.info("Finished downloading graph data from %s", graph_data_s3) return os.path.join(graph_path, graph_config) @@ -345,7 +348,7 @@ def upload_model_artifacts(model_s3_path, model_path, sagemaker_session): sagemaker_session: sagemaker.session.Session sagemaker_session to run download """ - logging.info("Upload model artifacts to %s", model_s3_path) + logging.info("Uploading model artifacts to %s", model_s3_path) # Rank0 will upload both dense models and learnable embeddings owned by Rank0. # Other ranks will only upload learnable embeddings owned by themselves. return upload_data_to_s3(model_s3_path, model_path, sagemaker_session)
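
---

Taken together, these patches replace ad-hoc `print` calls in the GraphStorm SageMaker runners with the standard `logging` module, configure per-host log formatting via `logging.basicConfig(..., force=True)`, and thread a new `--log-level` argument from the launch scripts into the entry points. The sketch below is a minimal, standalone illustration of that logging convention; it is not code from the repository, and the host name, S3 prefix, and `download_graph_data` helper are placeholders assumed for the example.

```python
import logging
import os

# Placeholder values standing in for what the SageMaker entry scripts read
# from their environment and from the new --log-level launcher argument.
current_host = os.environ.get("SM_CURRENT_HOST", "algo-1")
log_level = "INFO"
graph_data_s3 = "s3://example-bucket/graph/"  # placeholder S3 prefix

# Configure logging once, before any other logging call; force=True clears
# handlers an earlier import may have installed, mirroring the
# basicConfig(..., force=True) calls added to sagemaker_train.py and
# sagemaker_infer.py.
logging.basicConfig(
    level=getattr(logging, log_level.upper(), logging.INFO),
    format=f"{current_host}: %(asctime)s - %(levelname)s - %(message)s",
    force=True,
)


def download_graph_data(s3_prefix):
    """Simulate the log-then-rethrow pattern used in download_graph()."""
    try:
        # Stand-in for a failing S3Downloader.download call.
        raise OSError("connection reset by peer")
    except Exception as err:  # pylint: disable=broad-except
        # Log with lazy %-style arguments so the message is only formatted
        # when the level is enabled, then re-raise with the same detail.
        logging.error("Can not download graph_data from %s, %s.", s3_prefix, err)
        raise RuntimeError(
            f"Can not download graph_data from {s3_prefix}, {err}.") from err


if __name__ == "__main__":
    logging.info("Download graph from %s to %s", graph_data_s3, "/tmp/graph")
    try:
        download_graph_data(graph_data_s3)
    except RuntimeError as err:
        logging.error("Task failed: %s", err)
```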