[GSProcessing] BERT Tokenizer #700

Merged
merged 40 commits
Feb 1, 2024
Commits
0964886
add gconstruct converter
jalencato Jan 10, 2024
0a67217
first commit about code structure on tokenize feature transformation
jalencato Jan 10, 2024
7c20436
add first version with udf implementation
jalencato Jan 11, 2024
626565d
remove torch related
jalencato Jan 17, 2024
cfebe1d
remove torch
jalencato Jan 17, 2024
32da0c8
add doc
jalencato Jan 18, 2024
18d61e4
add
Jan 23, 2024
178da9f
add
jalencato Jan 23, 2024
288604e
Merge branch 'main' into bert_tokenzier
jalencato Jan 25, 2024
8a0f872
add fix
jalencato Jan 25, 2024
69edfd9
rename
jalencato Jan 26, 2024
c26aa68
rename
jalencato Jan 26, 2024
ad74f46
add test for huggingface
jalencato Jan 26, 2024
d7bccff
black reformat
jalencato Jan 26, 2024
bd8c075
apply lint
jalencato Jan 26, 2024
ae967da
add dependency
jalencato Jan 26, 2024
a81f4b5
add
jalencato Jan 26, 2024
9181826
test fix
jalencato Jan 27, 2024
93a9685
add fix
jalencato Jan 29, 2024
e091bf9
add test
jalencato Jan 29, 2024
69bee37
change config
jalencato Jan 29, 2024
dfd63e4
apply comments
jalencato Jan 31, 2024
250938a
apply comment
jalencato Jan 31, 2024
efb1bfa
apply lint
jalencato Jan 31, 2024
6d13571
add final line
jalencato Jan 31, 2024
3d85407
Update docs/source/gs-processing/developer/input-configuration.rst
jalencato Jan 31, 2024
89924c6
name change
jalencato Jan 31, 2024
4ec8563
add build docker'
jalencato Jan 31, 2024
966e6fd
add doc
jalencato Jan 31, 2024
aa43a83
add doc
jalencato Jan 31, 2024
a2d36d5
add doc
jalencato Jan 31, 2024
9cbf74e
change dockerfile
jalencato Feb 1, 2024
b702b30
add docker packing
jalencato Feb 1, 2024
a67c66c
doc
jalencato Feb 1, 2024
fe49292
Apply suggestions from code review
jalencato Feb 1, 2024
5c6fd8c
final version
jalencato Feb 1, 2024
b09d058
apply black
jalencato Feb 1, 2024
9d1a234
doc
jalencato Feb 1, 2024
6247297
convert
jalencato Feb 1, 2024
3c7bef1
Merge branch 'main' into bert_tokenzier
jalencato Feb 1, 2024
9 changes: 9 additions & 0 deletions docs/source/gs-processing/developer/input-configuration.rst
@@ -447,6 +447,15 @@ arguments.
will be considered as an array. For Parquet files, if the input type is ArrayType(StringType()), then the
separator is ignored; if it is StringType(), it will apply the same logic as in CSV.

- ``huggingface``

- Transforms a text feature column into tokens using pre-trained Hugging Face models, enabling processing of natural language data.
- ``kwargs``:

- ``action`` (String, required): The action to perform on the text data. Currently we only support text tokenization through Hugging Face models, so the only accepted value here is "tokenize_hf".
- ``bert_model`` (String, required): The identifier of a pre-trained model available on the Hugging Face Model Hub.
- ``max_seq_length`` (Integer, required): The maximum number of tokens in the tokenized output; longer inputs are truncated to this length.

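For reference, a feature entry using this transformation might look like the following, sketched here as a Python dict; the column name and model identifier are illustrative only:

```python
# A hypothetical GSProcessing feature entry using the "huggingface"
# transformation; key names follow the documentation above.
feature_entry = {
    "column": "paper_title",
    "transformation": {
        "name": "huggingface",
        "kwargs": {
            "action": "tokenize_hf",
            "bert_model": "bert-base-uncased",
            "max_seq_length": 128,
        },
    },
}

kwargs = feature_entry["transformation"]["kwargs"]
print(kwargs["action"])  # tokenize_hf
```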
--------------

Creating a graph for inference
12 changes: 11 additions & 1 deletion docs/source/gs-processing/usage/distributed-processing-setup.rst
@@ -111,6 +111,16 @@ The script also supports other arguments to customize the image name,
tag and other aspects of the build. See ``bash docker/build_gsprocessing_image.sh --help``
for more information.

For EMR Serverless images, setting up a VPC and NAT route is a necessary step when using text feature transformations,
because the workers need network access to download models from the Hugging Face Hub at runtime.
You can find detailed instructions on creating a VPC for EMR Serverless in the AWS documentation: `Create a VPC on emr-serverless
<https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/vpc-access.html>`_.
Alternatively, an easier approach is to include the Hugging Face model cache directly in your Docker image.
The ``build_gsprocessing_image.sh`` script provides an option to embed the model cache within the image:

.. code-block:: bash

bash docker/build_gsprocessing_image.sh --environment emr-serverless --model-name bert-base-uncased

Support for arm64 architecture
------------------------------

@@ -157,7 +167,7 @@ To build an EMR Serverless GSProcessing image for the ``arm64`` architecture you

.. code-block:: bash

bash docker/build_gsprocessing_image.sh --environment sagemaker --architecture arm64
bash docker/build_gsprocessing_image.sh --environment emr-serverless --architecture arm64

.. note::

3 changes: 3 additions & 0 deletions docs/source/gs-processing/usage/emr-serverless.rst
@@ -98,6 +98,9 @@ Here you will need to replace ``<aws-account-id>``, ``<arch>`` (``x86_64`` or ``
from the image you just created. GSProcessing version ``0.2.1`` uses ``emr-6.13.0`` as its
base image, so we need to ensure our application uses the same release.

Additionally, if you need to use text feature transformations, we suggest setting up a VPC and NAT route for the
EMR Serverless application: `Create a VPC on emr-serverless
<https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/vpc-access.html>`_

Allow EMR Serverless to access the custom image repository
----------------------------------------------------------
10 changes: 10 additions & 0 deletions graphstorm-processing/docker/0.2.1/emr-serverless/Dockerfile.cpu
@@ -47,6 +47,16 @@ RUN pip install -r /usr/lib/spark/code/requirements.txt \
# GSProcessing codebase
COPY code/ /usr/lib/spark/code/

# Optionally bake a Hugging Face model cache into the image
ARG MODEL=""
ENV TRANSFORMERS_CACHE=/home/hadoop/.cache/huggingface/hub
RUN if [ -z "$MODEL" ]; then \
echo "Skip installing model cache"; \
else \
echo "Installing model cache for $MODEL" && \
python3 -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('${MODEL}')"; \
fi

FROM runtime AS prod
RUN python -m pip install --no-deps /usr/lib/spark/code/graphstorm_processing-*.whl && \
rm /usr/lib/spark/code/graphstorm_processing-*.whl && rm -rf /root/.cache
12 changes: 10 additions & 2 deletions graphstorm-processing/docker/build_gsprocessing_image.sh
@@ -23,7 +23,9 @@ Available options:
-i, --image Docker image name, default is 'graphstorm-processing'.
-v, --version Docker version tag, default is the library's current version (`poetry version --short`)
-s, --suffix Suffix for the image tag, can be used to push custom image tags. Default is "".
-b, --build Docker build directory, default is '/tmp/`
-b, --build Docker build directory, default is '/tmp/'.
-m, --model Hugging Face model name to pack into the Docker image. Default is "".

EOF
exit
}
@@ -48,6 +50,7 @@ parse_params() {
TARGET='test'
ARCH='x86_64'
SUFFIX=""
MODEL=""

while :; do
case "${1-}" in
@@ -86,6 +89,10 @@
SUFFIX="${2-}"
shift
;;
-m | --model)
MODEL="${2-}"
shift
;;
-?*) die "Unknown option: $1" ;;
*) break ;;
esac
@@ -135,6 +142,7 @@ msg "- GSP_HOME: ${GSP_HOME}"
msg "- IMAGE_NAME: ${IMAGE_NAME}"
msg "- VERSION: ${VERSION}"
msg "- SUFFIX: ${SUFFIX}"
msg "- MODEL: ${MODEL}"

# Prepare Docker build directory
rm -rf "${BUILD_DIR}/docker/code"
@@ -170,4 +178,4 @@ fi

echo "Build a Docker image ${DOCKER_FULLNAME}"
DOCKER_BUILDKIT=1 docker build --platform "linux/${ARCH}" -f "${GSP_HOME}/docker/${VERSION}/${EXEC_ENV}/Dockerfile.cpu" \
"${BUILD_DIR}/docker/" -t $DOCKER_FULLNAME --target ${TARGET} --build-arg ARCH=${ARCH}
"${BUILD_DIR}/docker/" -t $DOCKER_FULLNAME --target ${TARGET} --build-arg ARCH=${ARCH} --build-arg MODEL=${MODEL}
@@ -13,6 +13,7 @@
See the License for the specific language governing permissions and
limitations under the License.
"""

from typing import Any

from .converter_base import ConfigConverter
@@ -134,6 +135,13 @@ def _convert_feature(feats: list[dict]) -> list[dict]:
else:
gsp_transformation_dict["name"] = "categorical"
gsp_transformation_dict["kwargs"] = {}
elif gconstruct_transform_dict["name"] == "tokenize_hf":
gsp_transformation_dict["name"] = "huggingface"
gsp_transformation_dict["kwargs"] = {
"action": "tokenize_hf",
"bert_model": gconstruct_transform_dict["bert_model"],
"max_seq_length": gconstruct_transform_dict["max_seq_length"],
}
# TODO: Add support for other common transformations here
else:
raise ValueError(
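The GConstruct-to-GSProcessing mapping shown in this hunk can be sketched as a standalone function; the helper name is hypothetical and only the key names come from the diff above:

```python
def convert_tokenize_hf(gconstruct_transform: dict) -> dict:
    """Sketch of the GConstruct 'tokenize_hf' -> GSProcessing 'huggingface' mapping."""
    assert gconstruct_transform["name"] == "tokenize_hf"
    return {
        "name": "huggingface",
        "kwargs": {
            "action": "tokenize_hf",
            "bert_model": gconstruct_transform["bert_model"],
            "max_seq_length": gconstruct_transform["max_seq_length"],
        },
    }

converted = convert_tokenize_hf(
    {"name": "tokenize_hf", "bert_model": "bert-base-uncased", "max_seq_length": 128}
)
print(converted["name"])  # huggingface
```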
@@ -27,6 +27,7 @@
NumericalFeatureConfig,
)
from .categorical_configs import MultiCategoricalFeatureConfig
from .hf_configs import HFConfig
from .data_config_base import DataStorageConfig


@@ -67,6 +68,8 @@ def parse_feat_config(feature_dict: Dict) -> FeatureConfig:
return FeatureConfig(feature_dict)
elif transformation_name == "multi-categorical":
return MultiCategoricalFeatureConfig(feature_dict)
elif transformation_name == "huggingface":
return HFConfig(feature_dict)
else:
raise RuntimeError(f"Unknown transformation name: '{transformation_name}'")

53 changes: 53 additions & 0 deletions graphstorm-processing/graphstorm_processing/config/hf_configs.py
@@ -0,0 +1,53 @@
"""
Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License").
You may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from typing import Mapping

from graphstorm_processing.constants import HUGGINGFACE_TOKENIZE
from .feature_config_base import FeatureConfig


class HFConfig(FeatureConfig):
"""Feature configuration for huggingface text features.

Supported kwargs
----------------
action: str, required
The type of Hugging Face action to use. The only valid value is "tokenize_hf".
bert_model: str, required
The identifier of the pre-trained Hugging Face model.
max_seq_length: int, required
The maximum length of the tokenization results.
"""

def __init__(self, config: Mapping):
super().__init__(config)
self.action = self._transformation_kwargs.get("action")
self.bert_model = self._transformation_kwargs.get("bert_model")
self.max_seq_length = self._transformation_kwargs.get("max_seq_length")

self._sanity_check()

def _sanity_check(self) -> None:
super()._sanity_check()
assert self.action in [HUGGINGFACE_TOKENIZE], \
f"huggingface action needs to be {HUGGINGFACE_TOKENIZE}"
assert isinstance(
self.bert_model, str
), f"Expect bert_model to be a string, but got {self.bert_model}"
assert (
isinstance(self.max_seq_length, int) and self.max_seq_length > 0
), f"Expect max_seq_length {self.max_seq_length} to be an integer larger than zero."
5 changes: 5 additions & 0 deletions graphstorm-processing/graphstorm_processing/constants.py
@@ -13,6 +13,7 @@
See the License for the specific language governing permissions and
limitations under the License.
"""

################### Categorical Limits #######################
MAX_CATEGORIES_PER_FEATURE = 100
RARE_CATEGORY = "GSP_CONSTANT_OTHER"
@@ -43,3 +44,7 @@
################# Numerical transformations ################
VALID_IMPUTERS = ["none", "mean", "median", "most_frequent"]
VALID_NORMALIZERS = ["none", "min-max", "standard", "rank-gauss"]

################# Bert transformations ################
HUGGINGFACE_TRANFORM = "huggingface"
HUGGINGFACE_TOKENIZE = "tokenize_hf"
@@ -13,6 +13,7 @@
See the License for the specific language governing permissions and
limitations under the License.
"""

import logging

from pyspark.sql import DataFrame
@@ -26,6 +27,7 @@
DistBucketNumericalTransformation,
DistCategoryTransformation,
DistMultiCategoryTransformation,
DistHFTransformation,
)


@@ -57,6 +59,8 @@ def __init__(self, feature_config: FeatureConfig):
self.transformation = DistCategoryTransformation(**default_kwargs, **args_dict)
elif feat_type == "multi-categorical":
self.transformation = DistMultiCategoryTransformation(**default_kwargs, **args_dict)
elif feat_type == "huggingface":
self.transformation = DistHFTransformation(**default_kwargs, **args_dict)
else:
raise NotImplementedError(
f"Feature {feat_name} has type: {feat_type} that is not supported"
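The feature-type dispatch in this hunk follows a simple name-to-class mapping; a toy stand-in (the class here is a dummy, not the real transformation) can sketch the pattern:

```python
# Toy stand-in for the real transformation class; it only records its kwargs.
class DistHFTransformation:
    def __init__(self, **kwargs):
        self.kwargs = kwargs

# Name -> transformation class, as in DistFeatureTransformer.__init__
TRANSFORMATIONS = {
    "huggingface": DistHFTransformation,
}

def build_transformation(feat_type: str, **kwargs):
    if feat_type not in TRANSFORMATIONS:
        raise NotImplementedError(f"Feature type: {feat_type} is not supported")
    return TRANSFORMATIONS[feat_type](**kwargs)

t = build_transformation("huggingface", action="tokenize_hf", bert_model="bert-base-uncased")
print(type(t).__name__)  # DistHFTransformation
```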
@@ -13,3 +13,4 @@
DistNumericalTransformation,
)
from .dist_bucket_numerical_transformation import DistBucketNumericalTransformation
from .dist_hf_transformation import DistHFTransformation
@@ -0,0 +1,131 @@
"""
Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License").
You may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from typing import Sequence
import numpy as np
from pyspark.sql import DataFrame
from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField
from pyspark.sql.functions import udf
from transformers import AutoTokenizer

from graphstorm_processing.constants import HUGGINGFACE_TOKENIZE
from .base_dist_transformation import DistributedTransformation


def apply_transform(
cols: Sequence[str], action: str, bert_model: str, max_seq_length: int, input_df: DataFrame
) -> DataFrame:
"""Applies a single normalizer to the imputed dataframe, individually to each of the columns
provided in the cols argument.

Parameters
----------
cols : Sequence[str]
List of column names to apply normalization to.
action : str
The type of normalization to use. Valid values is "tokenize"
jalencato marked this conversation as resolved.
Show resolved Hide resolved
bert_model : str
The name of huggingface model.
max_seq_length: int
The maximal length of the tokenization results.
input_df : DataFrame
The input DataFrame to apply normalization to.
"""

if action == HUGGINGFACE_TOKENIZE:
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(bert_model)

# Define the schema of your return type
schema = StructType(
[
StructField("input_ids", ArrayType(IntegerType())),
StructField("attention_mask", ArrayType(IntegerType())),
StructField("token_type_ids", ArrayType(IntegerType())),
]
)

# Define UDF
@udf(returnType=schema)
def tokenize(text):
# Check if text is a string
if not isinstance(text, str):
raise ValueError("The input of the tokenizer has to be a string.")

# Tokenize the text
t = tokenizer(
text,
max_length=max_seq_length,
truncation=True,
padding="max_length",
return_tensors="np",
)
token_type_ids = t.get("token_type_ids", np.zeros_like(t["input_ids"], dtype=np.int8))
result = (
t["input_ids"][0].tolist(), # Convert tensor to list
t["attention_mask"][0].astype(np.int8).tolist(),
token_type_ids[0].astype(np.int8).tolist(),
)

return result

# Apply the UDF to the DataFrame
transformed_df = input_df.withColumn(cols[0], tokenize(input_df[cols[0]]))
transformed_df = transformed_df.select(
transformed_df[cols[0]].getItem("input_ids").alias("input_ids"),
transformed_df[cols[0]].getItem("attention_mask").alias("attention_mask"),
transformed_df[cols[0]].getItem("token_type_ids").alias("token_type_ids"),
)
else:
raise ValueError(f"The input action needs to be {HUGGINGFACE_TOKENIZE}")

return transformed_df


class DistHFTransformation(DistributedTransformation):
"""Transformation to apply various forms of bert normalization to a text input.

Parameters
----------
cols : Sequence[str]
List of column names to apply normalization to.
action : str
The type of huggingface action to use. Valid values is "tokenize"
bert_model: str, required
The name of the lm model.
max_seq_length: int, required
The maximal length of the tokenization results.
"""

def __init__(
self, cols: Sequence[str], action: str, bert_model: str, max_seq_length: int
) -> None:
super().__init__(cols)
self.cols = cols
assert len(self.cols) == 1, "Huggingface transformation only supports a single column"
self.action = action
self.bert_model = bert_model
self.max_seq_length = max_seq_length

def apply(self, input_df: DataFrame) -> DataFrame:
transformed_df = apply_transform(
self.cols, self.action, self.bert_model, self.max_seq_length, input_df
)

return transformed_df

@staticmethod
def get_transformation_name() -> str:
return "DistHFTransformation"