From 0546046dcab5becf8b6dabd14bfdf25401f40d0c Mon Sep 17 00:00:00 2001 From: JalenCato Date: Fri, 26 Jul 2024 20:27:29 +0000 Subject: [PATCH 01/12] change --- .../config_conversion/gconstruct_converter.py | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py index c60e70a4e6..6aebcf5880 100644 --- a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py +++ b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py @@ -67,6 +67,11 @@ def _convert_label(labels: list[dict]) -> list[dict]: } else: label_custom_split_filenames = label["custom_split_filenames"] + if isinstance(label_custom_split_filenames["column"], list): + assert len(label_custom_split_filenames["column"]) <= 2, ( + "Custom split filenames should have one column for node labels, " + "and two columns for edges labels exactly" + ) label_dict["custom_split_filenames"] = { "train": label_custom_split_filenames["train"], "valid": label_custom_split_filenames["valid"], @@ -76,6 +81,11 @@ def _convert_label(labels: list[dict]) -> list[dict]: if "separator" in label: label_sep = label["separator"] label_dict["separator"] = label_sep + # Not supported for multi-task config for GSProcessing + assert "mask_field_names" not in label, ( + "GSProcessing currently do not support to " + "construct labels for multi-task learning" + ) labels_list.append(label_dict) except KeyError as exc: raise KeyError(f"A required key was missing from label input {label}") from exc @@ -103,6 +113,10 @@ def _convert_feature(feats: list[Mapping[str, Any]]) -> list[dict]: gsp_feat_dict["column"] = gconstruct_feat_dict["feature_col"] elif isinstance(gconstruct_feat_dict["feature_col"], list): gsp_feat_dict["column"] = gconstruct_feat_dict["feature_col"][0] + assert "feature_name" in gconstruct_feat_dict, ( + "feature_name should be in the gconstruct " + "feature field when feature_col is a list" + ) if "feature_name" in gconstruct_feat_dict: gsp_feat_dict["name"] = gconstruct_feat_dict["feature_name"] @@ -183,9 +197,10 @@ def _convert_feature(feats: list[Mapping[str, Any]]) -> list[dict]: gsp_transformation_dict["name"] = "no-op" if "out_dtype" in gconstruct_feat_dict: - assert ( - gconstruct_feat_dict["out_dtype"] == "float32" - ), "GSProcessing currently only supports float32 features" + assert gconstruct_feat_dict["out_dtype"] in ( + "float32", + "float64", + ), "GSProcessing currently only supports float32 or float64 features" gsp_feat_dict["transformation"] = gsp_transformation_dict gsp_feats_list.append(gsp_feat_dict) From 284c57d95d2608340fa834a8781097df165bab6b Mon Sep 17 00:00:00 2001 From: JalenCato Date: Fri, 26 Jul 2024 20:46:42 +0000 Subject: [PATCH 02/12] change --- .../config/config_conversion/gconstruct_converter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py index 6aebcf5880..14bffd2e31 100644 --- a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py +++ b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py @@ -264,6 +264,8 @@ def convert_edges(edges_entries): # format edge_format = e["format"]["name"] + assert edge_format in ("parquet", "csv"), \ + "GSProcessing only supports parquet files and csv files." if "separator" not in e["format"]: edge_separator = None else: From 020ef73bb018a8c3bad86932a555b5411ddac974 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Fri, 26 Jul 2024 20:47:54 +0000 Subject: [PATCH 03/12] change --- .../config/config_conversion/gconstruct_converter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py index 14bffd2e31..a4ae4d3dd3 100644 --- a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py +++ b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py @@ -215,6 +215,8 @@ def convert_nodes(nodes_entries): node_type, node_col = n["node_type"], n["node_id_col"] # format node_format = n["format"]["name"] + assert node_format in ("parquet", "csv"), \ + "GSProcessing only supports parquet files and csv files." if "separator" not in n["format"]: node_separator = None else: From 8775fa693c83ac9da13f3a39c9da62af596fbace Mon Sep 17 00:00:00 2001 From: JalenCato Date: Fri, 26 Jul 2024 22:19:45 +0000 Subject: [PATCH 04/12] black config --- .../config/config_conversion/gconstruct_converter.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py index a4ae4d3dd3..5a5a09bb47 100644 --- a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py +++ b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py @@ -215,8 +215,10 @@ def convert_nodes(nodes_entries): node_type, node_col = n["node_type"], n["node_id_col"] # format node_format = n["format"]["name"] - assert node_format in ("parquet", "csv"), \ - "GSProcessing only supports parquet files and csv files." + assert node_format in ( + "parquet", + "csv", + ), "GSProcessing only supports parquet files and csv files." if "separator" not in n["format"]: node_separator = None else: @@ -266,8 +268,10 @@ def convert_edges(edges_entries): # format edge_format = e["format"]["name"] - assert edge_format in ("parquet", "csv"), \ - "GSProcessing only supports parquet files and csv files." + assert edge_format in ( + "parquet", + "csv", + ), "GSProcessing only supports parquet files and csv files." if "separator" not in e["format"]: edge_separator = None else: From f3d372b53fdc8dd5f059122e7a4c93d0c0e39a0f Mon Sep 17 00:00:00 2001 From: JalenCato Date: Fri, 26 Jul 2024 22:35:03 +0000 Subject: [PATCH 05/12] black config --- .../config/config_conversion/gconstruct_converter.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py index 5a5a09bb47..24dd427434 100644 --- a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py +++ b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py @@ -113,10 +113,11 @@ def _convert_feature(feats: list[Mapping[str, Any]]) -> list[dict]: gsp_feat_dict["column"] = gconstruct_feat_dict["feature_col"] elif isinstance(gconstruct_feat_dict["feature_col"], list): gsp_feat_dict["column"] = gconstruct_feat_dict["feature_col"][0] - assert "feature_name" in gconstruct_feat_dict, ( - "feature_name should be in the gconstruct " - "feature field when feature_col is a list" - ) + if len(gconstruct_feat_dict["feature_col"]) >= 2: + assert "feature_name" in gconstruct_feat_dict, ( + "feature_name should be in the gconstruct " + "feature field when feature_col is a list" + ) if "feature_name" in gconstruct_feat_dict: gsp_feat_dict["name"] = gconstruct_feat_dict["feature_name"] From b217b4b5e9c22a2012bd92193c2d9f595723a3b7 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 30 Jul 2024 22:08:52 +0000 Subject: [PATCH 06/12] change --- .../config_conversion/gconstruct_converter.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py index 24dd427434..f813771f1d 100644 --- a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py +++ b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py @@ -18,6 +18,8 @@ from typing import Any from collections.abc import Mapping +from graphstorm_processing.constants import SUPPORTED_FILE_TYPES, VALID_OUTDTYPE + from .converter_base import ConfigConverter from .meta_configuration import NodeConfig, EdgeConfig @@ -216,10 +218,8 @@ def convert_nodes(nodes_entries): node_type, node_col = n["node_type"], n["node_id_col"] # format node_format = n["format"]["name"] - assert node_format in ( - "parquet", - "csv", - ), "GSProcessing only supports parquet files and csv files." + assert node_format in SUPPORTED_FILE_TYPES, \ + "GSProcessing only supports parquet files and csv files." if "separator" not in n["format"]: node_separator = None else: @@ -269,10 +269,8 @@ def convert_edges(edges_entries): # format edge_format = e["format"]["name"] - assert edge_format in ( - "parquet", - "csv", - ), "GSProcessing only supports parquet files and csv files." + assert edge_format in SUPPORTED_FILE_TYPES, \ + "GSProcessing only supports parquet files and csv files." if "separator" not in e["format"]: edge_separator = None else: From 5e7c2fc0c8b9761bb53059cf765dac1a86ee8362 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 30 Jul 2024 22:12:42 +0000 Subject: [PATCH 07/12] Change --- .../config/config_conversion/gconstruct_converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py index f813771f1d..47fc655f1c 100644 --- a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py +++ b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py @@ -134,7 +134,7 @@ def _convert_feature(feats: list[Mapping[str, Any]]) -> list[dict]: "imputer": "none", } - if gconstruct_transform_dict.get("out_dtype") in ["float32", "float64"]: + if gconstruct_transform_dict.get("out_dtype") in VALID_OUTDTYPE: gsp_transformation_dict["kwargs"]["out_dtype"] = gconstruct_transform_dict[ "out_dtype" ] From 777b5898caa55ad62eda6e0effdebcca3fbdca02 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 30 Jul 2024 22:13:44 +0000 Subject: [PATCH 08/12] change --- .../config/config_conversion/gconstruct_converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py index 47fc655f1c..0b99552300 100644 --- a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py +++ b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py @@ -85,7 +85,7 @@ def _convert_label(labels: list[dict]) -> list[dict]: label_dict["separator"] = label_sep # Not supported for multi-task config for GSProcessing assert "mask_field_names" not in label, ( - "GSProcessing currently do not support to " + "GSProcessing currently cannot " "construct labels for multi-task learning" ) labels_list.append(label_dict) From 86dd7acbcb63bd612f8cfeeefe1a84ef526ce83e Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 30 Jul 2024 22:14:47 +0000 Subject: [PATCH 09/12] change --- .../config/config_conversion/gconstruct_converter.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py index 0b99552300..bc9259cff1 100644 --- a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py +++ b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py @@ -200,10 +200,8 @@ def _convert_feature(feats: list[Mapping[str, Any]]) -> list[dict]: gsp_transformation_dict["name"] = "no-op" if "out_dtype" in gconstruct_feat_dict: - assert gconstruct_feat_dict["out_dtype"] in ( - "float32", - "float64", - ), "GSProcessing currently only supports float32 or float64 features" + assert gconstruct_feat_dict["out_dtype"] in VALID_OUTDTYPE, \ + "GSProcessing currently only supports float32 or float64 features" gsp_feat_dict["transformation"] = gsp_transformation_dict gsp_feats_list.append(gsp_feat_dict) From 9ff46daf9b5bc079a11692feb1c99806ee5f28cb Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 30 Jul 2024 22:34:45 +0000 Subject: [PATCH 10/12] test --- .../config_conversion/gconstruct_converter.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py index bc9259cff1..7b0cec3ce5 100644 --- a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py +++ b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py @@ -85,8 +85,7 @@ def _convert_label(labels: list[dict]) -> list[dict]: label_dict["separator"] = label_sep # Not supported for multi-task config for GSProcessing assert "mask_field_names" not in label, ( - "GSProcessing currently cannot " - "construct labels for multi-task learning" + "GSProcessing currently cannot " "construct labels for multi-task learning" ) labels_list.append(label_dict) except KeyError as exc: @@ -200,8 +199,9 @@ def _convert_feature(feats: list[Mapping[str, Any]]) -> list[dict]: gsp_transformation_dict["name"] = "no-op" if "out_dtype" in gconstruct_feat_dict: - assert gconstruct_feat_dict["out_dtype"] in VALID_OUTDTYPE, \ - "GSProcessing currently only supports float32 or float64 features" + assert ( + gconstruct_feat_dict["out_dtype"] in VALID_OUTDTYPE + ), "GSProcessing currently only supports float32 or float64 features" gsp_feat_dict["transformation"] = gsp_transformation_dict gsp_feats_list.append(gsp_feat_dict) @@ -216,8 +216,9 @@ def convert_nodes(nodes_entries): node_type, node_col = n["node_type"], n["node_id_col"] # format node_format = n["format"]["name"] - assert node_format in SUPPORTED_FILE_TYPES, \ - "GSProcessing only supports parquet files and csv files." + assert ( + node_format in SUPPORTED_FILE_TYPES + ), "GSProcessing only supports parquet files and csv files." if "separator" not in n["format"]: node_separator = None else: @@ -267,8 +268,9 @@ def convert_edges(edges_entries): # format edge_format = e["format"]["name"] - assert edge_format in SUPPORTED_FILE_TYPES, \ - "GSProcessing only supports parquet files and csv files." + assert ( + edge_format in SUPPORTED_FILE_TYPES + ), "GSProcessing only supports parquet files and csv files." if "separator" not in e["format"]: edge_separator = None else: From c56e61a9398812e07a812f6358f6a2cfbd43e6e1 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Wed, 31 Jul 2024 21:30:54 +0000 Subject: [PATCH 11/12] add test --- graphstorm-processing/tests/test_converter.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/graphstorm-processing/tests/test_converter.py b/graphstorm-processing/tests/test_converter.py index 84a56ce6fd..f3fd69018c 100644 --- a/graphstorm-processing/tests/test_converter.py +++ b/graphstorm-processing/tests/test_converter.py @@ -65,6 +65,65 @@ def test_try_read_unsupported_feature(converter: GConstructConfigConverter, node _ = converter.convert_nodes(node_dict["nodes"]) +def test_try_read_invalid_gconstruct_config(converter: GConstructConfigConverter, node_dict: dict): + """Custom Split Columns""" + node_dict["nodes"][0]["labels"] = [ + { + "label_col": "label", + "task_type": "classification", + "custom_split_filenames": { + "column": ["src", "dst", "inter"], + }, + "label_stats_type": "frequency_cnt", + } + ] + + with pytest.raises(AssertionError): + _ = converter.convert_nodes(node_dict["nodes"]) + + """Feature Name must exist for multiple feature columns""" + node_dict["nodes"][0]["features"] = [ + { + "feature_col": ["feature_1", "feature_2"] + } + ] + + with pytest.raises(AssertionError): + _ = converter.convert_nodes(node_dict["nodes"]) + + """Unsupported output dtype""" + node_dict["nodes"][0]["features"] = [ + { + "feature_col": ["feature_1"], + "out_dtype": "float16" + } + ] + + with pytest.raises(AssertionError): + _ = converter.convert_nodes(node_dict["nodes"]) + + """Unsupported format type""" + node_dict["nodes"][0]["format"] = \ + {"name": "txt", "separator": ","} + + with pytest.raises(AssertionError): + _ = converter.convert_nodes(node_dict["nodes"]) + + +def test_try_read_multi_task_gconstruct_config(converter: GConstructConfigConverter, node_dict: dict): + """Check unsupported mask column """ + node_dict["nodes"][0]["labels"] = [ + { + "label_col": "label", + "task_type": "classification", + "mask_field_names": "train_mask" + } + ] + + with pytest.raises(AssertionError): + _ = converter.convert_nodes(node_dict["nodes"]) + + @pytest.mark.parametrize("transform", ["max_min_norm", "rank_gauss"]) @pytest.mark.parametrize("out_dtype", ["float16", "float32", "float64"]) def test_try_convert_out_dtype( From 38702cc36f42452ded1f7ed8deeb207b08b593d2 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Wed, 31 Jul 2024 21:39:37 +0000 Subject: [PATCH 12/12] lint --- graphstorm-processing/tests/test_converter.py | 28 ++++++------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/graphstorm-processing/tests/test_converter.py b/graphstorm-processing/tests/test_converter.py index f3fd69018c..7b1752950b 100644 --- a/graphstorm-processing/tests/test_converter.py +++ b/graphstorm-processing/tests/test_converter.py @@ -82,42 +82,30 @@ def test_try_read_invalid_gconstruct_config(converter: GConstructConfigConverter _ = converter.convert_nodes(node_dict["nodes"]) """Feature Name must exist for multiple feature columns""" - node_dict["nodes"][0]["features"] = [ - { - "feature_col": ["feature_1", "feature_2"] - } - ] + node_dict["nodes"][0]["features"] = [{"feature_col": ["feature_1", "feature_2"]}] with pytest.raises(AssertionError): _ = converter.convert_nodes(node_dict["nodes"]) """Unsupported output dtype""" - node_dict["nodes"][0]["features"] = [ - { - "feature_col": ["feature_1"], - "out_dtype": "float16" - } - ] + node_dict["nodes"][0]["features"] = [{"feature_col": ["feature_1"], "out_dtype": "float16"}] with pytest.raises(AssertionError): _ = converter.convert_nodes(node_dict["nodes"]) """Unsupported format type""" - node_dict["nodes"][0]["format"] = \ - {"name": "txt", "separator": ","} + node_dict["nodes"][0]["format"] = {"name": "txt", "separator": ","} with pytest.raises(AssertionError): _ = converter.convert_nodes(node_dict["nodes"]) -def test_try_read_multi_task_gconstruct_config(converter: GConstructConfigConverter, node_dict: dict): - """Check unsupported mask column """ +def test_try_read_multi_task_gconstruct_config( + converter: GConstructConfigConverter, node_dict: dict +): + """Check unsupported mask column""" node_dict["nodes"][0]["labels"] = [ - { - "label_col": "label", - "task_type": "classification", - "mask_field_names": "train_mask" - } + {"label_col": "label", "task_type": "classification", "mask_field_names": "train_mask"} ] with pytest.raises(AssertionError):