Review comments

awslabs · Oct 24, 2023 · 8ac1c7b · 8ac1c7b
1 parent aa1013d
commit 8ac1c7b
Show file tree

Hide file tree

Showing 4 changed files with 31 additions and 19 deletions.
diff --git a/docs/source/gs-processing/developer/input-configuration.rst b/docs/source/gs-processing/developer/input-configuration.rst
@@ -193,9 +193,8 @@ following top-level keys:
             "files": ["String"],
             "separator": "String"
         },
-        "source": {"column": "String", "type": "String"},
-        "relation": {"type": "String"},
-        "destination": {"column": "String", "type": "String"},
+        "column": "String",
+        "type": "String",
         "labels" : [
                 {
                     "column": "String",
@@ -207,15 +206,15 @@ following top-level keys:
                     }
                 }
             ],
-            "features": [{}]
+        "features": [{}]
     }
 
 -  ``data``: (JSON object, required): Has the same definition as for
    the edges object, with one top-level key for the ``format`` that
    takes a String value, and one for the ``files`` that takes an array
    of String values.
--  ``column``: (String, required): The column in the data that
-   corresponds to the column that stores the node ids.
+-  ``column``: (String, required): The name of the column in the data that
+   stores the node ids.
 -  ``type``: (String, optional): A type name for the nodes described
    in this object. If not provided the ``column`` value is used as the
    node type.
@@ -252,7 +251,7 @@ following top-level keys:
          assign to the test set [0.0, 1.0).
 
 -  ``features`` (List of JSON objects, optional): Describes
-   the set of features for the current node type. See the next section, :ref:`features-object`
+   the set of features for the current node type. See the section :ref:`features-object`
    for details.
 
 --------------

diff --git a/graphstorm-processing/graphstorm_processing/config/numerical_configs.py b/graphstorm-processing/graphstorm_processing/config/numerical_configs.py
@@ -15,6 +15,7 @@
 """
 from typing import Mapping
 
+from graphstorm_processing.constants import VALID_IMPUTERS, VALID_NORMALIZERS
 from .feature_config_base import FeatureConfig
 
 
@@ -27,6 +28,7 @@ class NumericalFeatureConfig(FeatureConfig):
         A method to fill in missing values in the data. Valid values are:
         "mean" (Default), "median", and "most_frequent". Missing values will be replaced
         with the respective value computed from the data.
+
     normalizer: str
         A normalization to apply to each column. Valid values are
         "none", "min-max", and "standard".
@@ -48,14 +50,12 @@ def __init__(self, config: Mapping):
 
     def _sanity_check(self) -> None:
         super()._sanity_check()
-        valid_imputers = ["mean", "median", "most_frequent"]
         assert (
-            self.imputer in valid_imputers
-        ), f"Unknown imputer requested, expected one of {valid_imputers}, got {self.imputer}"
-        valid_normalizers = ["none", "min-max", "standard"]
+            self.imputer in VALID_IMPUTERS
+        ), f"Unknown imputer requested, expected one of {VALID_IMPUTERS}, got {self.imputer}"
         assert (
-            self.norm in valid_normalizers
-        ), f"Unknown normalizer requested, expected one of {valid_normalizers}, got {self.norm}"
+            self.norm in VALID_NORMALIZERS
+        ), f"Unknown normalizer requested, expected one of {VALID_NORMALIZERS}, got {self.norm}"
 
 
 class MultiNumericalFeatureConfig(NumericalFeatureConfig):
@@ -67,6 +67,7 @@ class MultiNumericalFeatureConfig(NumericalFeatureConfig):
         A method to fill in missing values in the data. Valid values are:
         "mean" (Default), "median", and "most_frequent". Missing values will be replaced
         with the respective value computed from the data.
+
     normalizer: str
         A normalization to apply to each column. Valid values are
         "none", "min-max", and "standard".
@@ -77,6 +78,7 @@ class MultiNumericalFeatureConfig(NumericalFeatureConfig):
         * "min-max": Normalize each value by subtracting the minimum value from it,
         and then dividing it by the difference between the maximum value and the minimum.
         * "standard": Normalize each value by dividing it by the sum of all the values.
+
     separator: str, optional
         A separator to use when splitting a delimited string into multiple numerical values
         as a vector. Only applicable to CSV input. Example: for a separator `'|'` the CSV

diff --git a/graphstorm-processing/graphstorm_processing/constants.py b/graphstorm-processing/graphstorm_processing/constants.py
@@ -39,3 +39,7 @@
 EXECUTOR_MEM_INSTANCE_MEM_RATIO = 0.95
 # Fraction of executor memory to be allocated as additional non-heap memory per process
 EXECUTOR_MEM_OVERHEAD_RATIO = 0.1
+
+################# Numerical transformations  ################
+VALID_IMPUTERS = ["mean", "median", "most_frequent"]
+VALID_NORMALIZERS = ["none", "min-max", "standard"]
diff --git a/...orm_processing/data_transformations/dist_transformations/dist_numerical_transformation.py b/...orm_processing/data_transformations/dist_transformations/dist_numerical_transformation.py
@@ -37,12 +37,16 @@ def apply_imputation(cols: Sequence[str], shared_imputation: str, input_df: Data
     Applies a single imputation to the input dataframe, individually to each of the columns
     provided in the cols argument.
     """
-    assert shared_imputation in [
+    valid_inner_imputers = [
         "mean",
         "median",
         "mode",
         "none",
-    ], f"Unsupported imputation strategy requested: {shared_imputation}"
+    ]
+    assert shared_imputation in valid_inner_imputers, (
+        f"Unsupported imputation strategy requested: {shared_imputation}, the supported "
+        f"strategies are : {valid_inner_imputers}"
+    )
     if shared_imputation == "none":
         imputed_df = input_df
     else:
@@ -97,7 +101,10 @@ def single_vec_to_float(vec):
         # TODO: See if it's possible to exclude NaN values from the sum
         for col, val in col_sums.items():
             if np.isinf(val) or np.isnan(val):
-                col_sums[col] = 0
+                raise RuntimeError(
+                    "Missing values found in the data, cannot apply "
+                    "normalization. Use an imputer in the transformation."
+                )
         scaled_df = imputed_df.select(
             [(F.col(c) / col_sums[f"sum({c})"]).alias(c) for c in cols] + other_cols
         )
@@ -147,8 +154,8 @@ def get_transformation_name() -> str:
 
 
 class DistMultiNumericalTransformation(DistNumericalTransformation):
-    """Transformation to apply missing value imputation and various forms of normalization
-     to a multi-column numerical input, where the input is a string separated by a delimiter.
+    """Transformation to apply missing value imputation and normalization
+     to a multi-column numerical input.
 
     Parameters
     ----------
@@ -270,7 +277,7 @@ def vector_df_has_nan(vector_df: DataFrame, vector_col: str) -> bool:
         # Convert the input column from either a delimited string or array to a Vector column
         multi_col_type = input_df.schema.jsonValue()["fields"][0]["type"]
         if multi_col_type == "string":
-            assert self.separator
+            assert self.separator, "Separator needed when dealing with CSV multi-column data."
             vector_df = convert_multistring_to_vector_df(input_df, self.separator)
         else:
             vector_df = input_df.select(