From 89b542a109e99c481d26ce5431c04b02891ca36b Mon Sep 17 00:00:00 2001 From: Theodore Vasiloudis Date: Fri, 23 Feb 2024 23:40:14 +0000 Subject: [PATCH] [GSProcessing] Small doc fixes --- .../developer/input-configuration.rst | 6 +++--- docs/source/gs-processing/usage/example.rst | 6 +++--- .../graphstorm_processing/distributed_executor.py | 15 ++++++++++----- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/docs/source/gs-processing/developer/input-configuration.rst b/docs/source/gs-processing/developer/input-configuration.rst index 36b3ccf864..a9753a842b 100644 --- a/docs/source/gs-processing/developer/input-configuration.rst +++ b/docs/source/gs-processing/developer/input-configuration.rst @@ -286,7 +286,7 @@ can contain the following top-level keys: feature values in the data. - ``transformation`` (JSON object, optional): The type of transformation that will be applied to the feature. For details on - the individual transformations supported see :ref:`supported-transformations`. + the individual transformations supported see :ref:`gsp-supported-transformations-ref`. If this key is missing, the feature is treated as a **no-op** feature without ``kwargs``. @@ -294,7 +294,7 @@ can contain the following top-level keys: applied. - ``kwargs`` (JSON object, optional): A dictionary of parameter names and values. Each individual transformation will have its own - supported parameters, described in :ref:`supported-transformations`. + supported parameters, described in :ref:`gsp-supported-transformations-ref`. - ``name`` (String, optional): The name that will be given to the encoded feature. If not given, **column** is used as the output name. @@ -470,7 +470,7 @@ arguments. You can find all models in the `Huggingface model repository `_. - ``max_seq_length`` (Integer, required): Specifies the maximum number of tokens of the input. You can use a length greater than the dataset's longest sentence; or for a safe value choose 128. 
Make sure to check - the model's max suported length when setting this value, + the model's max supported length when setting this value, -------------- diff --git a/docs/source/gs-processing/usage/example.rst b/docs/source/gs-processing/usage/example.rst index 43125d4d32..3f7feaa719 100644 --- a/docs/source/gs-processing/usage/example.rst +++ b/docs/source/gs-processing/usage/example.rst @@ -32,7 +32,7 @@ that contains the relevant data: Expected file inputs and configuration -------------------------------------- -GSProcessing expects the input files to be in specific format that will allow +GSProcessing expects the input files to be in a specific format that will allow us to perform the processing and prepare the data for partitioning and training. The data files are expected to be: @@ -40,9 +40,9 @@ The data files are expected to be: * Tabular data files. We support CSV-with-header format, or in Parquet format. The files can be split (multiple parts), or a single file. * Available on a local file system or on S3. -* One tabular file source per edge and node type. For example, for a particular edge +* One prefix per edge and node type. For example, for a particular edge type, all node identifiers (source, destination), features, and labels should - exist as columns in a single file source. + exist as columns in one or more files under a common prefix (local or on S3). 
Apart from the data, GSProcessing also requires a configuration file that describes the data and the transformations we will need to apply to the features and any encoding needed for diff --git a/graphstorm-processing/graphstorm_processing/distributed_executor.py b/graphstorm-processing/graphstorm_processing/distributed_executor.py index 791ef5d939..5b22ee2b8b 100644 --- a/graphstorm-processing/graphstorm_processing/distributed_executor.py +++ b/graphstorm-processing/graphstorm_processing/distributed_executor.py @@ -180,7 +180,7 @@ def __init__( "graph" ] else: - logging.warning("Unrecognized version name: %s", config_version) + logging.warning("Unrecognized configuration file version name: %s", config_version) try: converter = GConstructConfigConverter() self.graph_config_dict = converter.convert_to_gsprocessing(dataset_config_dict)[ @@ -192,8 +192,10 @@ def __init__( "graph" in dataset_config_dict ), "Top-level element 'graph' needs to exist in a GSProcessing config" self.graph_config_dict = dataset_config_dict["graph"] + logging.info("Parsed config file as GSProcessing config") else: # Older versions of GConstruct configs might be missing a version entry + logging.warning("No configuration file version name, trying to parse as GConstruct...") converter = GConstructConfigConverter() self.graph_config_dict = converter.convert_to_gsprocessing(dataset_config_dict)["graph"] @@ -263,7 +265,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--config-filename", type=str, - help="GSProcessing data configuration filename.", + help="GConstruct or GSProcessing data configuration filename.", required=True, ) parser.add_argument( @@ -309,9 +311,12 @@ def main(): is_sagemaker_execution = os.path.exists("/opt/ml/config/processingjobconfig.json") if gsprocessing_args.input_prefix.startswith("s3://"): - assert gsprocessing_args.output_prefix.startswith( - "s3://" - ), "When providing S3 input and output prefixes, they must both be S3." 
+ assert gsprocessing_args.output_prefix.startswith("s3://"), ( + "When providing S3 input and output prefixes, they must both be S3 URIs, got: " + f"input: '{gsprocessing_args.input_prefix}' " + f"and output: '{gsprocessing_args.output_prefix}'." + ) + filesystem_type = "s3" else: # Ensure input and output prefixes exist and convert to absolute paths