
[GSProcessing] Update GSProcessing custom split config parsing and documentation to match GConstruct. #1117

Open · wants to merge 3 commits into base: main
@@ -173,15 +173,23 @@ objects:
assign to the validation set [0.0, 1.0).
- ``test``: The percentage of the data with available labels to
assign to the test set [0.0, 1.0).
- ``custom_split_filenames`` (JSON object, optional): Specifies the customized
training/validation/test mask. Once it is defined, GSProcessing will ignore
the ``split_rate``.
- ``train``: Path of the training mask parquet file such that each line contains
the original ID for node tasks, or the pair [source_id, destination_id] for edge tasks.
- ``val``: Path of the validation mask parquet file such that each line contains
the original ID for node tasks, or the pair [source_id, destination_id] for edge tasks.
- ``test``: Path of the test mask parquet file such that each line contains
the original ID for node tasks, or the pair [source_id, destination_id] for edge tasks.
- ``custom_split_filenames`` (JSON object, optional): Specifies pre-assigned
training/validation/test masks. If defined, GSProcessing ignores
``split_rate`` even when one is provided.

- ``column``: (List[String], optional) A list of length one for node splits, or two for edge splits,
containing the name(s) of the column(s) that contain the node/edge ids for each split. For example,
if the node ids to include in each split exist in column ``"nid"`` of the custom train/val/test files, this
needs to be ``["nid"]``. For edges it would be a value like ``["src_id", "dst_id"]``.
If not provided for nodes, we assume the first column in the data contains the node ids to include.
For edges, we assume the first column is the source id and the second the destination id.
- ``train``: (List[String], optional) Paths of the training mask parquet files, where each row contains
the original ID for node tasks, or the pair ``[source_id, destination_id]`` for edge tasks.
- ``val``: (List[String], optional) Paths of the validation mask parquet files, where each row contains
the original ID for node tasks, or the pair ``[source_id, destination_id]`` for edge tasks.
- ``test``: (List[String], optional) Paths of the test mask parquet files, where each row contains
the original ID for node tasks, or the pair ``[source_id, destination_id]`` for edge tasks.
- Note: At least one of the ``["train", "val", "test"]`` keys must be present; see the example below.
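For example, a minimal label entry with pre-assigned splits could look like the following, expressed as the equivalent Python dict used in the GSProcessing tests (the file paths and the ``nid`` column name are hypothetical):

{
    "column": "label",
    "type": "classification",
    "custom_split_filenames": {
        "column": ["nid"],
        "train": ["labels/node_train_ids.parquet"],
        "val": ["labels/node_val_ids.parquet"],
        "test": ["labels/node_test_ids.parquet"],
    },
}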

- ``features`` (List of JSON objects, optional)\ **:** Describes
the set of features for the current edge type. See the :ref:`features-object` section for details.
@@ -50,7 +50,7 @@ def _convert_label(labels: list[dict]) -> list[dict]:
labels_list = []
if labels in [[], [{}]]:
return []
for label in labels:
for label in labels: # pylint: disable=too-many-nested-blocks
try:
label_column = label["label_col"] if "label_col" in label else ""
label_type = label["task_type"]
@@ -61,19 +61,39 @@
# check if split_pct is valid
assert (
math.fsum(label_splitrate) == 1.0
), "sum of the label split rate should be ==1.0"
), f"sum of the label split rate should be == 1.0, got {label_splitrate}"
label_dict["split_rate"] = {
"train": label_splitrate[0],
"val": label_splitrate[1],
"test": label_splitrate[2],
}
else:
label_custom_split_filenames = label["custom_split_filenames"]
if isinstance(label_custom_split_filenames["column"], list):
assert len(label_custom_split_filenames["column"]) <= 2, (
"Custom split filenames should have one column for node labels, "
"and two columns for edges labels exactly"
)
# Ensure at least one of ["train", "valid", "test"] is in the keys
assert any(
x in label_custom_split_filenames.keys() for x in ["train", "valid", "test"]
), (
"At least one of ['train', 'valid', 'test'] "
"needs to exist in custom split configs."
)

# Fill in missing values if needed
for entry in ["train", "valid", "test", "column"]:
entry_val = label_custom_split_filenames.get(entry, None)
if entry_val:
if isinstance(entry_val, str):
label_custom_split_filenames[entry] = [entry_val]
Collaborator: In this way, do we want to support string input for the GSProcessing config?

Contributor Author: Yes, since GConstruct supports both, we use this to convert to list[str], which is our expectation for GSProcessing.

Collaborator: Right. But I suppose the code here only converts the GConstruct config; do we also support string input in the GSProcessing config itself?

Contributor Author: I would prefer having a single way to do things to avoid confusing users. This goes along with the Python principle of "There should be one-- and preferably only one --obvious way to do it." https://peps.python.org/pep-0020/

else:
assert isinstance(
entry_val, list
), "Custom split filenames should be a string or a list of strings"
else:
label_custom_split_filenames[entry] = []
assert len(label_custom_split_filenames["column"]) <= 2, (
"Custom split filenames should have one column for node labels, "
"and two columns for edges labels exactly"
)

label_dict["custom_split_filenames"] = {
"train": label_custom_split_filenames["train"],
"valid": label_custom_split_filenames["valid"],
@@ -213,7 +233,7 @@ def _convert_feature(feats: list[Mapping[str, Any]]) -> list[dict]:
return gsp_feats_list

@staticmethod
def convert_nodes(nodes_entries):
def convert_nodes(nodes_entries) -> list[NodeConfig]:
res = []
for n in nodes_entries:
# type, column id
@@ -194,6 +194,8 @@ def __init__(self, edge_dict: Dict[str, Dict], data_dict: Dict[str, Any]):
self._dst_ntype = edge_dict["dest"]["type"]
self._rel_type = edge_dict["relation"]["type"]
self._rel_col: Optional[str] = edge_dict["relation"].get("column", None)
self._feature_configs = []
self._labels = []

if "features" in edge_dict:
for feature_dict in edge_dict["features"]:
@@ -60,17 +60,24 @@ def _sanity_check(self):
"When no label column is specified, the task type must be link_prediction, "
f"got {self._task_type}"
)
# Sanity checks for random and custom splits
if "custom_split_filenames" not in self._config:
assert isinstance(self._task_type, str)
assert isinstance(self._split, dict)
assert isinstance(self._separator, str) if self._multilabel else self._separator is None
else:
assert isinstance(self._custom_split_filenames, dict)
assert "train" in self._custom_split_filenames
assert "valid" in self._custom_split_filenames
assert "test" in self._custom_split_filenames
assert "column" in self._custom_split_filenames
assert isinstance(self._separator, str) if self._multilabel else self._separator is None

# Ensure at least one of ["train", "valid", "test"] is in the keys
assert any(x in self._custom_split_filenames.keys() for x in ["train", "valid", "test"])

for entry in ["train", "valid", "test", "column"]:
# All existing entries must be lists of strings
if entry in self._custom_split_filenames:
assert isinstance(self._custom_split_filenames[entry], list)
assert all(isinstance(x, str) for x in self._custom_split_filenames[entry])

assert isinstance(self._separator, str) if self._multilabel else self._separator is None

if self._mask_field_names:
assert isinstance(self._mask_field_names, list)
assert all(isinstance(x, str) for x in self._mask_field_names)
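Under the relaxed checks above, a custom split config needs only one of the three split keys, and every entry that is present must be a list of strings. A minimal sketch with hypothetical paths:

# Only "train" is given; "valid" and "test" may be omitted entirely.
custom_split = {
    "train": ["labels/train_ids.parquet"],
    "column": ["nid"],
}
assert any(k in custom_split for k in ["train", "valid", "test"])
for entry in ["train", "valid", "test", "column"]:
    if entry in custom_split:
        assert isinstance(custom_split[entry], list)
        assert all(isinstance(x, str) for x in custom_split[entry])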
@@ -111,7 +118,11 @@ def custom_split_filenames(self) -> Dict[str, Any]:
@property
def mask_field_names(self) -> Optional[tuple[str, str, str]]:
"""Custom names to assign to masks for multi-task learning."""
return tuple(self._mask_field_names) if self._mask_field_names else None
if self._mask_field_names is None:
return None
else:
assert len(self._mask_field_names) == 3
return (self._mask_field_names[0], self._mask_field_names[1], self._mask_field_names[2])


class EdgeLabelConfig(LabelConfig):
@@ -73,19 +73,19 @@ class CustomSplit:

Parameters
----------
train : str
Path of the training mask parquet file.
valid : str
Path of the validation mask parquet file.
test : str
Path of the testing mask parquet file.
train : list[str]
Paths of the training mask parquet files.
valid : list[str]
Paths of the validation mask parquet files.
test : list[str]
Paths of the testing mask parquet files.
mask_columns : list[str]
List of columns that contain original string ids.
"""

train: str
valid: str
test: str
train: list[str]
valid: list[str]
test: list[str]
mask_columns: list[str]
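With the fields now typed as list[str], a single split can reference several parquet files. A minimal construction sketch, assuming CustomSplit has been imported from its defining module and using hypothetical paths:

# Two training files, one file each for validation and test.
split = CustomSplit(
    train=["labels/train_part0.parquet", "labels/train_part1.parquet"],
    valid=["labels/val.parquet"],
    test=["labels/test.parquet"],
    mask_columns=["nid"],
)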


@@ -177,7 +177,7 @@ def __init__(
assert os.path.isabs(loader_config.input_prefix), "We expect an absolute path"
self.filesystem_type = "local"

self.spark = spark # type: SparkSession
self.spark: SparkSession = spark
self.add_reverse_edges = loader_config.add_reverse_edges
# Remove trailing slash in s3 paths
if self.filesystem_type == "s3":
@@ -2131,11 +2131,11 @@ def create_mapping(input_df):
return return_df

if mask_type == "train":
file_path = split_file.train
file_paths = split_file.train
elif mask_type == "val":
file_path = split_file.valid
file_paths = split_file.valid
elif mask_type == "test":
file_path = split_file.test
file_paths = split_file.test
else:
raise ValueError("Unknown mask type")

@@ -2144,7 +2144,7 @@ def create_mapping(input_df):
if len(split_file.mask_columns) == 1:
# custom split on node original id
custom_mask_df = self.spark.read.parquet(
os.path.join(self.input_prefix, file_path)
*[os.path.join(self.input_prefix, file_path) for file_path in file_paths]
).select(col(split_file.mask_columns[0]).alias(f"custom_{mask_type}_mask"))
input_df_id = create_mapping(input_df)
mask_df = input_df_id.join(
@@ -2162,7 +2162,7 @@ def create_mapping(input_df):
elif len(split_file.mask_columns) == 2:
# custom split on edge (srd, dst) original ids
custom_mask_df = self.spark.read.parquet(
os.path.join(self.input_prefix, file_path)
*[os.path.join(self.input_prefix, file_path) for file_path in file_paths]
).select(
col(split_file.mask_columns[0]).alias(f"custom_{mask_type}_mask_src"),
col(split_file.mask_columns[1]).alias(f"custom_{mask_type}_mask_dst"),
@@ -2184,7 +2184,10 @@ def create_mapping(input_df):
.alias(mask_name),
).select(mask_name)
else:
raise ValueError("The number of column should be only 1 or 2.")
raise ValueError(
    "The number of mask columns should be 1 or 2, got columns: "
    f"{split_file.mask_columns}"
)

return mask_df
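The list unpacking above relies on spark.read.parquet accepting a variable number of path arguments and reading them all into a single DataFrame. A short sketch, assuming an active SparkSession named spark and hypothetical paths:

import os

input_prefix = "s3://example-bucket/input"  # hypothetical prefix
file_paths = ["masks/train_part0.parquet", "masks/train_part1.parquet"]
# Each relative path is joined to the prefix, then all files are read at once.
mask_df = spark.read.parquet(
    *[os.path.join(input_prefix, p) for p in file_paths]
)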

1 change: 1 addition & 0 deletions graphstorm-processing/pyproject.toml
@@ -41,6 +41,7 @@ pre-commit = "^3.3.3"
types-mock = "^5.1.0.1"
pylint = "~2.17.5"
diff-cover = "^9.0.0"
pytest-cov = "^6.0.0"

[project]
requires-python = ">=3.9" # TODO: Do we need a tilde here?
34 changes: 29 additions & 5 deletions graphstorm-processing/tests/test_converter.py
@@ -24,7 +24,7 @@
@pytest.fixture(name="converter")
def fixture_create_converter() -> GConstructConfigConverter:
"""Creates a new converter object for each test."""
yield GConstructConfigConverter()
return GConstructConfigConverter()


@pytest.fixture(name="node_dict")
@@ -65,8 +65,32 @@ def test_try_read_unsupported_feature(converter: GConstructConfigConverter, node
_ = converter.convert_nodes(node_dict["nodes"])


def test_custom_split_config_conversion(converter: GConstructConfigConverter):
"""Test custom split file config conversion"""
gconstruct_label_dicts = [
{
"label_col": "label",
"task_type": "classification",
"custom_split_filenames": {
"column": ["src", "dst"],
},
"label_stats_type": "frequency_cnt",
}
]

# Should raise when none of train/val/test are in the input
with pytest.raises(AssertionError):
converter._convert_label(gconstruct_label_dicts)

# Ensure single strings are converted to list of strings
gconstruct_label_dicts[0]["custom_split_filenames"]["train"] = "fake_file"
gsprocessing_label_dict = converter._convert_label(gconstruct_label_dicts)[0]

assert gsprocessing_label_dict["custom_split_filenames"]["train"] == ["fake_file"]


def test_try_read_invalid_gconstruct_config(converter: GConstructConfigConverter, node_dict: dict):
"""Custom Split Columns"""
"""Test various invalid input scenarios"""
node_dict["nodes"][0]["labels"] = [
{
"label_col": "label",
@@ -238,9 +262,9 @@ def test_read_node_gconstruct(converter: GConstructConfigConverter, node_dict: d
"column": "label",
"type": "classification",
"custom_split_filenames": {
"train": "customized_label/node_train_idx.parquet",
"valid": "customized_label/node_val_idx.parquet",
"test": "customized_label/node_test_idx.parquet",
"train": ["customized_label/node_train_idx.parquet"],
"valid": ["customized_label/node_val_idx.parquet"],
"test": ["customized_label/node_test_idx.parquet"],
"column": ["ID"],
},
}
24 changes: 12 additions & 12 deletions graphstorm-processing/tests/test_dist_heterogenous_loader.py
@@ -955,9 +955,9 @@ def test_node_custom_label(spark, dghl_loader: DistHeterogeneousGraphLoader, tmp
"type": "classification",
"split_rate": {"train": 0.8, "val": 0.1, "test": 0.1},
"custom_split_filenames": {
"train": f"{tmp_path}/train.parquet",
"valid": f"{tmp_path}/val.parquet",
"test": f"{tmp_path}/test.parquet",
"train": [f"{tmp_path}/train.parquet"],
"valid": [f"{tmp_path}/val.parquet"],
"test": [f"{tmp_path}/test.parquet"],
"column": ["mask_id"],
},
}
@@ -1010,9 +1010,9 @@ def test_edge_custom_label(spark, dghl_loader: DistHeterogeneousGraphLoader, tmp
"column": "",
"type": "link_prediction",
"custom_split_filenames": {
"train": f"{tmp_path}/train.parquet",
"valid": f"{tmp_path}/val.parquet",
"test": f"{tmp_path}/test.parquet",
"train": [f"{tmp_path}/train.parquet"],
"valid": [f"{tmp_path}/val.parquet"],
"test": [f"{tmp_path}/test.parquet"],
"column": ["mask_src_id", "mask_dst_id"],
},
}
@@ -1061,9 +1061,9 @@ def test_node_custom_label_multitask(spark, dghl_loader: DistHeterogeneousGraphL
"type": "classification",
"split_rate": {"train": 0.8, "val": 0.1, "test": 0.1},
"custom_split_filenames": {
"train": f"{tmp_path}/train.parquet",
"valid": f"{tmp_path}/val.parquet",
"test": f"{tmp_path}/test.parquet",
"train": [f"{tmp_path}/train.parquet"],
"valid": [f"{tmp_path}/val.parquet"],
"test": [f"{tmp_path}/test.parquet"],
"column": ["mask_id"],
},
"mask_field_names": class_mask_names,
@@ -1173,9 +1173,9 @@ def test_edge_custom_label_multitask(spark, dghl_loader: DistHeterogeneousGraphL
"column": "",
"type": "link_prediction",
"custom_split_filenames": {
"train": f"{tmp_path}/train.parquet",
"valid": f"{tmp_path}/val.parquet",
"test": f"{tmp_path}/test.parquet",
"train": [f"{tmp_path}/train.parquet"],
"valid": [f"{tmp_path}/val.parquet"],
"test": [f"{tmp_path}/test.parquet"],
"column": ["mask_src_id", "mask_dst_id"],
},
"mask_field_names": class_mask_names,