diff --git a/docs/source/cli/graph-construction/distributed/gspartition/ec2-clusters.rst b/docs/source/cli/graph-construction/distributed/gspartition/ec2-clusters.rst index 8d10753978..a338e51ab5 100644 --- a/docs/source/cli/graph-construction/distributed/gspartition/ec2-clusters.rst +++ b/docs/source/cli/graph-construction/distributed/gspartition/ec2-clusters.rst @@ -129,6 +129,7 @@ The arguments that ``graphstorm.gpartition.dist_partition_graph`` accepts are th * ``--ip-config str``: A file storing a list of IPs, one line for each instance of the partition cluster. * ``--partition-assignment-only``: Only generate partition assignments for nodes, the process will not build the partitioned DGL graph. * ``--logging-level str``: The logging level. The possible values: debug, info, warning, error. The default value is info. (default: info) +* ``--process-group-timeout int``: Timeout, in seconds, for operations executed against the process group. (default: 1800) * ``--use-graphbolt "true"/"false"``: ``New in v0.4``. Whether to convert the partitioned data to the GraphBolt format after creating the DistDGL graph. Requires installed DGL version to be at least ``2.1.0``. See :ref:`using-graphbolt-ref` for an example. (default: ``"false"``) diff --git a/python/graphstorm/gpartition/dist_partition_graph.py b/python/graphstorm/gpartition/dist_partition_graph.py index fffddbe738..11c5ff7440 100644 --- a/python/graphstorm/gpartition/dist_partition_graph.py +++ b/python/graphstorm/gpartition/dist_partition_graph.py @@ -50,7 +50,8 @@ def run_build_dglgraph( output_path, metadata_filename, dgl_tool_path, - ssh_port): + ssh_port, + process_group_timeout): """ Build DistDGL Graph Parameters @@ -67,6 +68,8 @@ def run_build_dglgraph( The filename for the graph partitioning metadata file we'll use to determine data sources. ssh_port: int SSH port + process_group_timeout: int + Timeout[seconds] for operations executed against the process group. 
""" # Get the python interpreter used right now. # If we can not get it we go with the default `python3` @@ -84,6 +87,7 @@ def run_build_dglgraph( "--ssh-port", f"{ssh_port}", "--python-path", f"{python_bin}", "--log-level", logging.getLevelName(logging.root.getEffectiveLevel()), + "--process-group-timeout", str(process_group_timeout), "--save-orig-nids", "--save-orig-eids"] @@ -153,7 +157,8 @@ def main(): os.path.join(output_path, "dist_graph"), args.metadata_filename, args.dgl_tool_path, - args.ssh_port) + args.ssh_port, + args.process_group_timeout) logging.info("DGL graph building took %f sec", dgl_graph_start - time.time()) @@ -236,6 +241,9 @@ def parse_args() -> argparse.Namespace: default="false", help=("Whether to convert the partitioned data to the GraphBolt format " "after creating the DistDGL graph.")) + argparser.add_argument("--process-group-timeout", type=int, default=1800, + help="Timeout[seconds] for operations executed " + "against the process group.") return argparser.parse_args() diff --git a/tests/end2end-tests/graphbolt-gs-integration/graphbolt-graph-construction.sh b/tests/end2end-tests/graphbolt-gs-integration/graphbolt-graph-construction.sh index 4e32aed053..c73f197b82 100644 --- a/tests/end2end-tests/graphbolt-gs-integration/graphbolt-graph-construction.sh +++ b/tests/end2end-tests/graphbolt-gs-integration/graphbolt-graph-construction.sh @@ -126,7 +126,8 @@ python3 -m graphstorm.gpartition.dist_partition_graph \ --num-parts 2 \ --output-path "$DIST_GRAPHBOLT_PATH" \ --ssh-port 2222 \ - --use-graphbolt "true" + --use-graphbolt "true" \ + --process-group-timeout 3600 # Ensure GraphBolt files were created by GSPartition for i in $(seq 0 1); do