Skip to content

Commit

Permalink
Review comments
Browse files Browse the repository at this point in the history
  • Loading branch information
thvasilo committed Oct 24, 2023
1 parent aa1013d commit 8ac1c7b
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 19 deletions.
13 changes: 6 additions & 7 deletions docs/source/gs-processing/developer/input-configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -193,9 +193,8 @@ following top-level keys:
"files": ["String"],
"separator": "String"
},
"source": {"column": "String", "type": "String"},
"relation": {"type": "String"},
"destination": {"column": "String", "type": "String"},
"column": "String",
"type": "String",
"labels" : [
{
"column": "String",
Expand All @@ -207,15 +206,15 @@ following top-level keys:
}
}
],
"features": [{}]
"features": [{}]
}
- ``data``: (JSON object, required): Has the same definition as for
the edges object, with one top-level key, ``format``, that
takes a String value, and one, ``files``, that takes an array
of String values.
- ``column``: (String, required): The column in the data that
corresponds to the column that stores the node ids.
- ``column``: (String, required): The name of the column in the data that
stores the node ids.
- ``type``: (String, optional): A type name for the nodes described
in this object. If not provided the ``column`` value is used as the
node type.
Expand Down Expand Up @@ -252,7 +251,7 @@ following top-level keys:
assign to the test set [0.0, 1.0).

- ``features`` (List of JSON objects, optional): Describes
the set of features for the current node type. See the next section, :ref:`features-object`
the set of features for the current node type. See the section :ref:`features-object`
for details.

--------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"""
from typing import Mapping

from graphstorm_processing.constants import VALID_IMPUTERS, VALID_NORMALIZERS
from .feature_config_base import FeatureConfig


Expand All @@ -27,6 +28,7 @@ class NumericalFeatureConfig(FeatureConfig):
A method to fill in missing values in the data. Valid values are:
"mean" (Default), "median", and "most_frequent". Missing values will be replaced
with the respective value computed from the data.
normalizer: str
A normalization to apply to each column. Valid values are
"none", "min-max", and "standard".
Expand All @@ -48,14 +50,12 @@ def __init__(self, config: Mapping):

def _sanity_check(self) -> None:
super()._sanity_check()
valid_imputers = ["mean", "median", "most_frequent"]
assert (
self.imputer in valid_imputers
), f"Unknown imputer requested, expected one of {valid_imputers}, got {self.imputer}"
valid_normalizers = ["none", "min-max", "standard"]
self.imputer in VALID_IMPUTERS
), f"Unknown imputer requested, expected one of {VALID_IMPUTERS}, got {self.imputer}"
assert (
self.norm in valid_normalizers
), f"Unknown normalizer requested, expected one of {valid_normalizers}, got {self.norm}"
self.norm in VALID_NORMALIZERS
), f"Unknown normalizer requested, expected one of {VALID_NORMALIZERS}, got {self.norm}"


class MultiNumericalFeatureConfig(NumericalFeatureConfig):
Expand All @@ -67,6 +67,7 @@ class MultiNumericalFeatureConfig(NumericalFeatureConfig):
A method to fill in missing values in the data. Valid values are:
"mean" (Default), "median", and "most_frequent". Missing values will be replaced
with the respective value computed from the data.
normalizer: str
A normalization to apply to each column. Valid values are
"none", "min-max", and "standard".
Expand All @@ -77,6 +78,7 @@ class MultiNumericalFeatureConfig(NumericalFeatureConfig):
* "min-max": Normalize each value by subtracting the minimum value from it,
and then dividing it by the difference between the maximum value and the minimum.
* "standard": Normalize each value by dividing it by the sum of all the values.
separator: str, optional
A separator to use when splitting a delimited string into multiple numerical values
as a vector. Only applicable to CSV input. Example: for a separator `'|'` the CSV
Expand Down
4 changes: 4 additions & 0 deletions graphstorm-processing/graphstorm_processing/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,7 @@
EXECUTOR_MEM_INSTANCE_MEM_RATIO = 0.95
# Fraction of executor memory to be allocated as additional non-heap memory per process
EXECUTOR_MEM_OVERHEAD_RATIO = 0.1

################# Numerical transformations ################
VALID_IMPUTERS = ["mean", "median", "most_frequent"]
VALID_NORMALIZERS = ["none", "min-max", "standard"]
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,16 @@ def apply_imputation(cols: Sequence[str], shared_imputation: str, input_df: Data
Applies a single imputation to input dataframe, individually to each of the columns
provided in the cols argument.
"""
assert shared_imputation in [
valid_inner_imputers = [
"mean",
"median",
"mode",
"none",
], f"Unsupported imputation strategy requested: {shared_imputation}"
]
assert shared_imputation in valid_inner_imputers, (
f"Unsupported imputation strategy requested: {shared_imputation}, the supported "
f"strategies are : {valid_inner_imputers}"
)
if shared_imputation == "none":
imputed_df = input_df
else:
Expand Down Expand Up @@ -97,7 +101,10 @@ def single_vec_to_float(vec):
# TODO: See if it's possible to exclude NaN values from the sum
for col, val in col_sums.items():
if np.isinf(val) or np.isnan(val):
col_sums[col] = 0
raise RuntimeError(
"Missing values found in the data, cannot apply "
"normalization. Use an imputer in the transformation."
)
scaled_df = imputed_df.select(
[(F.col(c) / col_sums[f"sum({c})"]).alias(c) for c in cols] + other_cols
)
Expand Down Expand Up @@ -147,8 +154,8 @@ def get_transformation_name() -> str:


class DistMultiNumericalTransformation(DistNumericalTransformation):
"""Transformation to apply missing value imputation and various forms of normalization
to a multi-column numerical input, where the input is a string separated by a delimiter.
"""Transformation to apply missing value imputation and normalization
to a multi-column numerical input.
Parameters
----------
Expand Down Expand Up @@ -270,7 +277,7 @@ def vector_df_has_nan(vector_df: DataFrame, vector_col: str) -> bool:
# Convert the input column from either a delimited string or array to a Vector column
multi_col_type = input_df.schema.jsonValue()["fields"][0]["type"]
if multi_col_type == "string":
assert self.separator
assert self.separator, "Separator needed when dealing with CSV multi-column data."
vector_df = convert_multistring_to_vector_df(input_df, self.separator)
else:
vector_df = input_df.select(
Expand Down

0 comments on commit 8ac1c7b

Please sign in to comment.