From c2794c4745dab907c36785bb0a0d48145adf07af Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Thu, 26 Sep 2024 11:13:20 -0700 Subject: [PATCH] Update --- .../graph-construction/single-machine-gconstruct.rst | 2 +- .../model-training-inference/distributed/sagemaker.rst | 2 +- .../graphstorm_processing/distributed_executor.py | 9 ++++++--- python/graphstorm/gconstruct/construct_graph.py | 3 ++- python/graphstorm/utils.py | 10 +++++++--- tests/unit-tests/test_gsf.py | 4 ++++ 6 files changed, 21 insertions(+), 9 deletions(-) diff --git a/docs/source/cli/graph-construction/single-machine-gconstruct.rst b/docs/source/cli/graph-construction/single-machine-gconstruct.rst index d0d75c2d32..f0d7849c61 100644 --- a/docs/source/cli/graph-construction/single-machine-gconstruct.rst +++ b/docs/source/cli/graph-construction/single-machine-gconstruct.rst @@ -33,7 +33,7 @@ Full argument list of the ``gconstruct.construct_graph`` command * **-\-num-processes-for-nodes**: the number of processes to process node data simultaneously. Increase this number can speed up node data processing. * **-\-num-processes-for-edges**: the number of processes to process edge data simultaneously. Increase this number can speed up edge data processing. * **-\-output-dir**: (**Required**) the path of the output data files. -* **-\-graph-name**: (**Required**) the name assigned for the graph. The graph name must adhere to the Python identifier naming rules with the exception that hyphens (``-``) are permitted. +* **-\-graph-name**: (**Required**) the name assigned for the graph. The graph name must adhere to the Python identifier naming rules with the exception that hyphens (``-``) are permitted and the name can start with numbers. * **-\-remap-node-id**: boolean value to decide whether to rename node IDs or not. Adding this argument will set it to be true, otherwise false. * **-\-add-reverse-edges**: boolean value to decide whether to add reverse edges for the given graph. Adding this argument sets it to true; otherwise, it defaults to false. It is **strongly** suggested to include this argument for graph construction, as some nodes in the original data may not have in-degrees, and thus cannot update their presentations by aggregating messages from their neighbors. Adding this arugment helps prevent this issue. * **-\-output-format**: the format of constructed graph, options are ``DGL``, ``DistDGL``. Default is ``DistDGL``. It also accepts multiple graph formats at the same time separated by an space, for example ``--output-format "DGL DistDGL"``. The output format is explained in the :ref:`Output ` section above. diff --git a/docs/source/cli/model-training-inference/distributed/sagemaker.rst b/docs/source/cli/model-training-inference/distributed/sagemaker.rst index 5b111726cb..ab1c892271 100644 --- a/docs/source/cli/model-training-inference/distributed/sagemaker.rst +++ b/docs/source/cli/model-training-inference/distributed/sagemaker.rst @@ -388,7 +388,7 @@ The rest of the arguments are passed on to ``sagemaker_train.py`` or ``sagemaker * **--task-type**: Task type. * **--graph-data-s3**: S3 location of the input graph. -* **--graph-name**: Name of the input graph. +* **--graph-name**: Name of the input graph. The graph name must adhere to the Python identifier naming rules with the exception that hyphens (``-``) are permitted and the name can start with numbers. * **--yaml-s3**: S3 location of yaml file for training and inference. * **--custom-script**: Custom training script provided by customers to run customer training logic. This should be a path to the Python script within the Docker image. * **--output-emb-s3**: S3 location to store GraphStorm generated node embeddings. This is an inference only argument. diff --git a/graphstorm-processing/graphstorm_processing/distributed_executor.py b/graphstorm-processing/graphstorm_processing/distributed_executor.py index ffe2bd8a1a..295ccad60b 100644 --- a/graphstorm-processing/graphstorm_processing/distributed_executor.py +++ b/graphstorm-processing/graphstorm_processing/distributed_executor.py @@ -543,7 +543,8 @@ def parse_args() -> argparse.Namespace: help="Name for the graph being processed." "The graph name must adhere to the Python " "identifier naming rules with the exception " - "that hyphens (-) are permitted.", + "that hyphens (-) are permitted and the name " + "can start with numbers", required=False, default=None, ) @@ -573,7 +574,8 @@ def check_graph_name(graph_name): We enforce that the graph name adheres to the Python identifier naming rules as in https://docs.python.org/3/reference/lexical_analysis.html#identifiers, - with the exception that hyphens (-) are permitted. + with the exception that hyphens (-) are permitted + and the name can start with numbers. This helps avoid the cases when an invalid graph name, such as `/graph`, causes unexpected errors. @@ -587,7 +589,8 @@ def check_graph_name(graph_name): assert graph_name.replace("-", "_").isidentifier(), ( "GraphStorm expects the graph name adheres to the Python" "identifier naming rules with the exception that hyphens " - f"(-) are permitted. But we get {graph_name}" + f"(-) are permitted and the name can start with numbers. " + "But we get {graph_name}" ) diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py index 56d9b4b64e..ceaee5adfd 100644 --- a/python/graphstorm/gconstruct/construct_graph.py +++ b/python/graphstorm/gconstruct/construct_graph.py @@ -913,7 +913,8 @@ def process_graph(args): help="Name for the graph being processed." "The graph name must adhere to the Python " "identifier naming rules with the exception " - "that hyphens (-) are permitted.",) + "that hyphens (-) are permitted " + "and the name can start with numbers",) argparser.add_argument("--remap-node-id", action='store_true', help="Whether or not to remap node IDs.") argparser.add_argument("--add-reverse-edges", action='store_true', diff --git a/python/graphstorm/utils.py b/python/graphstorm/utils.py index a58c9daafd..72bb6465e6 100644 --- a/python/graphstorm/utils.py +++ b/python/graphstorm/utils.py @@ -21,6 +21,7 @@ import resource import logging import psutil +import re import pandas as pd import dgl @@ -37,7 +38,8 @@ def check_graph_name(graph_name): We enforce that the graph name adheres to the Python identifier naming rules as in https://docs.python.org/3/reference/lexical_analysis.html#identifiers, - with the exception that hyphens (-) are permitted. + with the exception that hyphens (-) are permitted + and the name can start with numbers. This helps avoid the cases when an invalid graph name, such as `/graph`, causes unexpected errors. @@ -46,10 +48,12 @@ def check_graph_name(graph_name): graph_name: str Graph Name. """ - assert graph_name.replace('-', '_').isidentifier(), \ + gname = re.sub(r'^\d+', '', graph_name) + assert gname.replace('-', '_').isidentifier(), \ "GraphStorm expects the graph name adheres to the Python" \ "identifier naming rules with the exception that hyphens " \ - f"(-) are permitted. But we get {graph_name}" + f"(-) are permitted and the name can start with numbers. " \ + "But we get {graph_name}" def get_graph_name(part_config): """ Get graph name from graph partition config file diff --git a/tests/unit-tests/test_gsf.py b/tests/unit-tests/test_gsf.py index 6be5958788..1f3e1d58d0 100644 --- a/tests/unit-tests/test_gsf.py +++ b/tests/unit-tests/test_gsf.py @@ -457,6 +457,10 @@ def test_check_graph_name(): check_graph_name(graph_name) graph_name = "graph-name" check_graph_name(graph_name) + graph_name = "123-graph-name" + check_graph_name(graph_name) + graph_name = "_Graph-name" + check_graph_name(graph_name) # test with invalid graph name graph_name = "/graph_name"