From 84babde04eb6e6ad53bf58f9852b22647e24d2ab Mon Sep 17 00:00:00 2001
From: EC2 Default User <ec2-user@ip-172-31-2-209.us-west-2.compute.internal>
Date: Tue, 29 Oct 2024 22:06:44 +0000
Subject: [PATCH 01/50] update gconstruct converter

---
 .../config_conversion/gconstruct_converter.py    |  7 +++++++
 graphstorm-processing/tests/test_converter.py    | 16 ++++++++++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py
index 5129254538..ce615e7a6a 100644
--- a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py
+++ b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py
@@ -188,6 +188,13 @@ def _convert_feature(feats: list[Mapping[str, Any]]) -> list[dict]:
                         "hf_model": gconstruct_transform_dict["bert_model"],
                         "max_seq_length": gconstruct_transform_dict["max_seq_length"],
                     }
+                elif gconstruct_transform_dict["name"] == "edge_dst_hard_negative":
+                    # NOTE: this does not verify that the task is link prediction.
+                    gsp_transformation_dict["name"] = "edge_dst_hard_negative"
+                    if "separator" in gconstruct_transform_dict:
+                        gsp_transformation_dict["kwargs"] = {
+                            "separator": gconstruct_transform_dict["separator"],
+                        }
                 else:
                     raise ValueError(
                         "Unsupported GConstruct transformation name: "
diff --git a/graphstorm-processing/tests/test_converter.py b/graphstorm-processing/tests/test_converter.py
index 334ef72284..30b04f3855 100644
--- a/graphstorm-processing/tests/test_converter.py
+++ b/graphstorm-processing/tests/test_converter.py
@@ -401,7 +401,14 @@ def test_convert_gsprocessing(converter: GConstructConfigConverter):
             "files": ["/tmp/acm_raw/edges/author_writing_paper.parquet"],
             "source_id_col": "~from",
             "dest_id_col": "~to",
-            "features": [{"feature_col": ["author"], "feature_name": "feat"}],
+            "features": [
+                {"feature_col": ["author"], "feature_name": "feat"},
+                {
+                    "feature_col": ["author"],
+                    "feature_name": "hard_negative",
+                    "transform": {"name": "edge_dst_hard_negative"},
+                },
+            ],
             "labels": [
                 {
                     "label_col": "edge_col",
@@ -505,7 +512,12 @@ def test_convert_gsprocessing(converter: GConstructConfigConverter):
     assert edges_output["dest"] == {"column": "~to", "type": "paper"}
     assert edges_output["relation"] == {"type": "writing"}
     assert edges_output["features"] == [
-        {"column": "author", "transformation": {"name": "no-op"}, "name": "feat"}
+        {"column": "author", "transformation": {"name": "no-op"}, "name": "feat"},
+        {
+            "column": "author",
+            "name": "hard_negative",
+            "transformation": {"name": "edge_dst_hard_negative"},
+        },
     ]
     assert edges_output["labels"] == [
         {

From ba88b9f41bb4617cc448abd6bca8151c85fcc38c Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Wed, 30 Oct 2024 21:36:03 +0000
Subject: [PATCH 02/50] gsprocessing part

---
 .../config/config_parser.py                   |  3 +
 .../config/hard_negative_configs.py           | 42 +++++++++++
 .../dist_feature_transformer.py               |  3 +
 .../dist_hard_negative_transformation.py      | 73 +++++++++++++++++++
 4 files changed, 121 insertions(+)
 create mode 100644 graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py
 create mode 100644 graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py

diff --git a/graphstorm-processing/graphstorm_processing/config/config_parser.py b/graphstorm-processing/graphstorm_processing/config/config_parser.py
index 38e92528d8..15f323ced8 100644
--- a/graphstorm-processing/graphstorm_processing/config/config_parser.py
+++ b/graphstorm-processing/graphstorm_processing/config/config_parser.py
@@ -29,6 +29,7 @@
 )
 from .categorical_configs import MultiCategoricalFeatureConfig
 from .hf_configs import HFConfig
+from .hard_negative_configs import HardNegativeConfig
 from .data_config_base import DataStorageConfig
 
 
@@ -71,6 +72,8 @@ def parse_feat_config(feature_dict: Dict) -> FeatureConfig:
         return MultiCategoricalFeatureConfig(feature_dict)
     elif transformation_name == "huggingface":
         return HFConfig(feature_dict)
+    elif transformation_name == "edge_dst_hard_negative":
+        return HardNegativeConfig(feature_dict)
     else:
         raise RuntimeError(f"Unknown transformation name: '{transformation_name}'")
 
diff --git a/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py
new file mode 100644
index 0000000000..c4924c57e0
--- /dev/null
+++ b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py
@@ -0,0 +1,42 @@
+"""
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License").
+You may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from typing import Mapping
+
+from .feature_config_base import FeatureConfig
+
+
+class HardNegativeConfig(FeatureConfig):
+    """Feature configuration for hard negative features. Only link prediction is supported.
+
+    Supported kwargs
+    ----------------
+    separator: str, optional
+        The separator for string input value. Only required when input value type is string.
+    """
+
+    def __init__(self, config: Mapping):
+        super().__init__(config)
+        self.separator = self._transformation_kwargs.get("separator")
+
+        self._sanity_check()
+
+    def _sanity_check(self) -> None:
+        super()._sanity_check()
+        assert self.action in [
+            HUGGINGFACE_TOKENIZE,
+            HUGGINGFACE_EMB,
+        ], f"huggingface action needs to be one of {HUGGINGFACE_TOKENIZE, HUGGINGFACE_EMB}"
diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
index b68c3eeb96..57ba1b84de 100644
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
@@ -28,6 +28,7 @@
     DistCategoryTransformation,
     DistMultiCategoryTransformation,
     DistHFTransformation,
+    DistHardNegativeTransformation,
 )
 
 
@@ -69,6 +70,8 @@ def __init__(
             self.transformation = DistMultiCategoryTransformation(**default_kwargs, **args_dict)
         elif feat_type == "huggingface":
             self.transformation = DistHFTransformation(**default_kwargs, **args_dict)
+        elif feat_type == "edge_dst_hard_negative":
+            self.transformation = DistHardNegativeTransformation(**default_kwargs, **args_dict)
         else:
             raise NotImplementedError(
                 f"Feature {feat_name} has type: {feat_type} that is not supported"
diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
new file mode 100644
index 0000000000..cfc8211404
--- /dev/null
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -0,0 +1,73 @@
+"""
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License").
+You may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+import os
+from typing import Sequence
+import numpy as np
+import torch as th
+from pyspark.sql import DataFrame
+from pyspark.sql.types import ArrayType, IntegerType, FloatType, StructType, StructField
+from pyspark.sql.functions import udf
+from transformers import AutoTokenizer, AutoModel, AutoConfig
+
+from graphstorm_processing.constants import HUGGINGFACE_TOKENIZE, HUGGINGFACE_EMB
+from .base_dist_transformation import DistributedTransformation
+
+
+def apply_transform(
+    cols: Sequence[str], separator: str, input_df: DataFrame
+) -> DataFrame:
+    """Applies hard negative transformation to each row.
+
+    Parameters
+    ----------
+    cols : Sequence[str]
+        List of column names to apply normalization to.
+    separator: str, optional
+        The separator for string input value. Only required when input value type is string.
+    """
+
+    return transformed_df
+
+
+class DistHardNegativeTransformation(DistributedTransformation):
+    """Transformation to apply hard negative transformation.
+
+    Parameters
+    ----------
+    separator: str, optional
+        The separator for string input value. Only required when input value type is string.
+    """
+
+    def __init__(
+        self, cols: Sequence[str], separator: str = "",
+    ) -> None:
+        super().__init__(cols)
+        self.cols = cols
+        assert len(self.cols) == 1, "Hard Negative Transformation only supports single column"
+        self.separator = separator
+
+    def apply(self, input_df: DataFrame) -> DataFrame:
+        transformed_df = apply_transform(
+            self.cols, self.separator, input_df
+        )
+
+        return transformed_df
+
+    @staticmethod
+    def get_transformation_name() -> str:
+        return "DistHardNegativeTransformation"

From 613814fe3fd1e8b7fae6899bee5cd9e4e26440d2 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Wed, 30 Oct 2024 21:41:30 +0000
Subject: [PATCH 03/50] hard negative config

---
 .../graphstorm_processing/config/hard_negative_configs.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py
index c4924c57e0..00a99c8f1b 100644
--- a/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py
+++ b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py
@@ -30,13 +30,9 @@ class HardNegativeConfig(FeatureConfig):
 
     def __init__(self, config: Mapping):
         super().__init__(config)
-        self.separator = self._transformation_kwargs.get("separator")
+        self.separator = self._transformation_kwargs.get("separator", None)
 
         self._sanity_check()
 
     def _sanity_check(self) -> None:
-        super()._sanity_check()
-        assert self.action in [
-            HUGGINGFACE_TOKENIZE,
-            HUGGINGFACE_EMB,
-        ], f"huggingface action needs to be one of {HUGGINGFACE_TOKENIZE, HUGGINGFACE_EMB}"
+        super()._sanity_check()
\ No newline at end of file

From 53ea8007a4fd367992ef7ebcb7a60f1e3ea54584 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Wed, 30 Oct 2024 23:17:38 +0000
Subject: [PATCH 04/50] add code file

---
 .../dist_feature_transformer.py               |  6 ++++--
 .../dist_transformations/__init__.py          |  1 +
 .../dist_hard_negative_transformation.py      | 21 ++++++++++++-------
 .../dist_heterogeneous_loader.py              |  7 ++++++-
 4 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
index 57ba1b84de..f3dfeac509 100644
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
@@ -39,7 +39,7 @@ class DistFeatureTransformer(object):
     """
 
     def __init__(
-        self, feature_config: FeatureConfig, spark: SparkSession, json_representation: dict
+        self, feature_config: FeatureConfig, spark: SparkSession, json_representation: dict, edge_mapping_dict: dict = None
     ):
         feat_type = feature_config.feat_type
         feat_name = feature_config.feat_name
@@ -47,6 +47,8 @@ def __init__(
         self.transformation: DistributedTransformation
         # We use this to re-apply transformations
         self.json_representation = json_representation
+        # Edge mapping file location
+        self.edge_mapping_dict = edge_mapping_dict
 
         default_kwargs = {
             "cols": feature_config.cols,
@@ -71,7 +73,7 @@ def __init__(
         elif feat_type == "huggingface":
             self.transformation = DistHFTransformation(**default_kwargs, **args_dict)
         elif feat_type == "edge_dst_hard_negative":
-            self.transformation = DistHardNegativeTransformation(**default_kwargs, **args_dict)
+            self.transformation = DistHardNegativeTransformation(**default_kwargs, **args_dict, edge_mapping_dict=edge_mapping_dict)
         else:
             raise NotImplementedError(
                 f"Feature {feat_name} has type: {feat_type} that is not supported"
diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/__init__.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/__init__.py
index 4849c53acc..5c74d4928a 100644
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/__init__.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/__init__.py
@@ -15,3 +15,4 @@
 )
 from .dist_bucket_numerical_transformation import DistBucketNumericalTransformation
 from .dist_hf_transformation import DistHFTransformation
+from .dist_hard_negative_transformation import DistHardNegativeTransformation
diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index cfc8211404..34d08c975c 100644
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -17,19 +17,14 @@
 import logging
 import os
 from typing import Sequence
-import numpy as np
-import torch as th
 from pyspark.sql import DataFrame
-from pyspark.sql.types import ArrayType, IntegerType, FloatType, StructType, StructField
 from pyspark.sql.functions import udf
-from transformers import AutoTokenizer, AutoModel, AutoConfig
 
-from graphstorm_processing.constants import HUGGINGFACE_TOKENIZE, HUGGINGFACE_EMB
 from .base_dist_transformation import DistributedTransformation
 
 
 def apply_transform(
-    cols: Sequence[str], separator: str, input_df: DataFrame
+    cols: Sequence[str], separator: str, input_df: DataFrame, edge_mapping_dict: dict
 ) -> DataFrame:
     """Applies hard negative transformation to each row.
 
@@ -39,8 +34,16 @@ def apply_transform(
         List of column names to apply normalization to.
     separator: str, optional
         The separator for string input value. Only required when input value type is string.
+    input_df : DataFrame
+        The input DataFrame to apply normalization to.
+    edge_mapping_dict: dict
+        The mapping dictionary contain mapping file directory and edge type
     """
 
+    input_df.show()
+    print(edge_mapping_dict)
+    exit(-1)
+
     return transformed_df
 
 
@@ -54,16 +57,18 @@ class DistHardNegativeTransformation(DistributedTransformation):
     """
 
     def __init__(
-        self, cols: Sequence[str], separator: str = "",
+        self, cols: Sequence[str], separator: str = "", edge_mapping_dict=None
     ) -> None:
         super().__init__(cols)
         self.cols = cols
         assert len(self.cols) == 1, "Hard Negative Transformation only supports single column"
         self.separator = separator
+        self.edge_mapping_dict = edge_mapping_dict
+        assert self.edge_mapping_dict, "edge mapping dict cannot be None for hard negative "
 
     def apply(self, input_df: DataFrame) -> DataFrame:
         transformed_df = apply_transform(
-            self.cols, self.separator, input_df
+            self.cols, self.separator, input_df, self.edge_mapping_dict
         )
 
         return transformed_df
diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py
index db56720840..1e86edebf4 100644
--- a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py
+++ b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py
@@ -1654,7 +1654,12 @@ def _process_edge_features(
                 .get(edge_type, {})
                 .get(feat_conf.feat_name, {})
             )
-            transformer = DistFeatureTransformer(feat_conf, self.spark, json_representation)
+            # Hard Negative Transformation use case, but should be able to be reused
+            edge_mapping_dict = {
+                "edge_type": edge_type,
+                "mapping_path": f"{self.output_prefix}/raw_id_mappings/"
+            }
+            transformer = DistFeatureTransformer(feat_conf, self.spark, json_representation, edge_mapping_dict)
 
             if json_representation:
                 logging.info(

From 9e9d35e77ed09e903599b07c8876905f7a4fb0d2 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Thu, 31 Oct 2024 22:17:59 +0000
Subject: [PATCH 05/50] finish gsprocessing related development

---
 .../graphstorm_processing/constants.py        |  3 ++
 .../dist_feature_transformer.py               |  3 +-
 .../dist_hard_negative_transformation.py      | 50 +++++++++++++++----
 .../dist_heterogeneous_loader.py              |  3 +-
 .../graph_loaders/schema_utils.py             |  1 -
 5 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/graphstorm-processing/graphstorm_processing/constants.py b/graphstorm-processing/graphstorm_processing/constants.py
index cbb48f4c02..fc27b17686 100644
--- a/graphstorm-processing/graphstorm_processing/constants.py
+++ b/graphstorm-processing/graphstorm_processing/constants.py
@@ -58,6 +58,9 @@
 HUGGINGFACE_TOKENIZE = "tokenize_hf"
 HUGGINGFACE_EMB = "embedding_hf"
 
+################# Node Mapping  ################
+NODE_MAPPING_STR = "orig"
+NODE_MAPPING_INT = "new"
 
 ################# Supported execution envs  ##############
 class ExecutionEnv(Enum):
diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
index f3dfeac509..a9c0a14ae5 100644
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
@@ -73,7 +73,8 @@ def __init__(
         elif feat_type == "huggingface":
             self.transformation = DistHFTransformation(**default_kwargs, **args_dict)
         elif feat_type == "edge_dst_hard_negative":
-            self.transformation = DistHardNegativeTransformation(**default_kwargs, **args_dict, edge_mapping_dict=edge_mapping_dict)
+            self.transformation = DistHardNegativeTransformation(**default_kwargs, **args_dict,
+                                                                 spark=spark, edge_mapping_dict=edge_mapping_dict)
         else:
             raise NotImplementedError(
                 f"Feature {feat_name} has type: {feat_type} that is not supported"
diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index 34d08c975c..2b859112a6 100644
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -17,14 +17,16 @@
 import logging
 import os
 from typing import Sequence
-from pyspark.sql import DataFrame
-from pyspark.sql.functions import udf
+from pyspark.sql.functions import udf, split, col
+from pyspark.sql.types import ArrayType, IntegerType, StringType
+from pyspark.sql import DataFrame, functions as F, SparkSession
 
 from .base_dist_transformation import DistributedTransformation
 
+from graphstorm_processing.constants import NODE_MAPPING_STR, NODE_MAPPING_INT
 
 def apply_transform(
-    cols: Sequence[str], separator: str, input_df: DataFrame, edge_mapping_dict: dict
+    cols: Sequence[str], separator: str, spark: SparkSession, input_df: DataFrame, edge_mapping_dict: dict
 ) -> DataFrame:
     """Applies hard negative transformation to each row.
 
@@ -39,11 +41,32 @@ def apply_transform(
     edge_mapping_dict: dict
         The mapping dictionary contain mapping file directory and edge type
     """
-
-    input_df.show()
-    print(edge_mapping_dict)
-    exit(-1)
-
+    column_type = input_df.schema[cols[0]].dataType
+    if isinstance(column_type, StringType):
+        transformed_df = input_df.withColumn(cols[0], split(col(cols[0]), separator))
+    else:
+        transformed_df = input_df
+    # Edge type is expected to have the form "src_ntype:relation:dst_ntype".
+    # Assume every node ID in the hard negative feature belongs to the dst node type.
+    _, _, dst_type = edge_mapping_dict["edge_type"].split(":")
+    mapping_prefix = edge_mapping_dict["mapping_path"]
+    format_name = edge_mapping_dict["format_name"]
+    hard_negative_node_mapping = spark.read.parquet(f"{mapping_prefix}{dst_type}/{format_name}/*.parquet")
+    node_mapping_length = hard_negative_node_mapping.count()
+
+    # TODO: Collecting the mapping to the driver may not scale; consider a join-based solution.
+    hard_negative_node_mapping_dict = {row[NODE_MAPPING_STR]: row[NODE_MAPPING_INT] for row in hard_negative_node_mapping.collect()}
+
+    # Pad every list with -1 to the same length so the feature can be converted to a tensor.
+    def map_values(hard_neg_list):
+        mapped_values = [hard_negative_node_mapping_dict.get(item, -1) for item in hard_neg_list]
+        while len(mapped_values) < node_mapping_length:
+            mapped_values.append(-1)
+        return mapped_values
+
+    map_values_udf = F.udf(map_values, ArrayType(IntegerType()))
+
+    transformed_df = transformed_df.select(map_values_udf(F.col(cols[0])).alias(cols[0]))
     return transformed_df
 
 
@@ -54,12 +77,16 @@ class DistHardNegativeTransformation(DistributedTransformation):
     ----------
     separator: str, optional
         The separator for string input value. Only required when input value type is string.
+    spark: SparkSession
+        The spark session
+    edge_mapping_dict: dict
+        The node type and mapping directory
     """
 
     def __init__(
-        self, cols: Sequence[str], separator: str = "", edge_mapping_dict=None
+        self, cols: Sequence[str], spark: SparkSession, separator: str = "", edge_mapping_dict=None
     ) -> None:
-        super().__init__(cols)
+        super().__init__(cols, spark)
         self.cols = cols
         assert len(self.cols) == 1, "Hard Negative Transformation only supports single column"
         self.separator = separator
@@ -67,8 +94,9 @@ def __init__(
         assert self.edge_mapping_dict, "edge mapping dict cannot be None for hard negative "
 
     def apply(self, input_df: DataFrame) -> DataFrame:
+        assert self.spark
         transformed_df = apply_transform(
-            self.cols, self.separator, input_df, self.edge_mapping_dict
+            self.cols, self.separator, self.spark, input_df, self.edge_mapping_dict
         )
 
         return transformed_df
diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py
index 1e86edebf4..ef3d5c00c2 100644
--- a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py
+++ b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py
@@ -1657,7 +1657,8 @@ def _process_edge_features(
             # Hard Negative Transformation use case, but should be able to be reused
             edge_mapping_dict = {
                 "edge_type": edge_type,
-                "mapping_path": f"{self.output_prefix}/raw_id_mappings/"
+                "mapping_path": f"{self.output_prefix}/raw_id_mappings/",
+                "format_name": FORMAT_NAME
             }
             transformer = DistFeatureTransformer(feat_conf, self.spark, json_representation, edge_mapping_dict)
 
diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py b/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py
index 19ed03869d..ae9b22a397 100644
--- a/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py
+++ b/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py
@@ -65,7 +65,6 @@ def _parse_features_schema(features_objects: Sequence[FeatureConfig]) -> Sequenc
             if StructField(feature_col, spark_feature_type(), True) in field_list:
                 continue
             field_list.append(StructField(feature_col, spark_feature_type(), True))
-
     return field_list
 
 

From 37e30e62636afea481c88d5bfeb5a9df42babc0a Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Thu, 31 Oct 2024 22:19:26 +0000
Subject: [PATCH 06/50] add blank

---
 .../graphstorm_processing/graph_loaders/schema_utils.py          | 1 +
 1 file changed, 1 insertion(+)

diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py b/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py
index ae9b22a397..b65a5d92dc 100644
--- a/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py
+++ b/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py
@@ -65,6 +65,7 @@ def _parse_features_schema(features_objects: Sequence[FeatureConfig]) -> Sequenc
             if StructField(feature_col, spark_feature_type(), True) in field_list:
                 continue
             field_list.append(StructField(feature_col, spark_feature_type(), True))
+            
     return field_list
 
 

From 8b702e51acf463ac767963bbb7570fdaf1bcbd1f Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Thu, 31 Oct 2024 22:20:01 +0000
Subject: [PATCH 07/50] tab

---
 .../graphstorm_processing/graph_loaders/schema_utils.py         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py b/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py
index b65a5d92dc..19ed03869d 100644
--- a/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py
+++ b/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py
@@ -65,7 +65,7 @@ def _parse_features_schema(features_objects: Sequence[FeatureConfig]) -> Sequenc
             if StructField(feature_col, spark_feature_type(), True) in field_list:
                 continue
             field_list.append(StructField(feature_col, spark_feature_type(), True))
-            
+
     return field_list
 
 

From a50b657d0cc707f3c18424affa788605946fbbc5 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Fri, 1 Nov 2024 00:06:58 +0000
Subject: [PATCH 08/50] hard negative for gspartition

---
 python/graphstorm/gpartition/__init__.py      |  1 +
 .../gpartition/dist_partition_graph.py        | 22 +++++-
 .../gpartition/post_hard_negative.py          | 73 +++++++++++++++++++
 python/graphstorm/model/utils.py              |  3 +
 4 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 python/graphstorm/gpartition/post_hard_negative.py

diff --git a/python/graphstorm/gpartition/__init__.py b/python/graphstorm/gpartition/__init__.py
index c7957002c2..818d6ca79a 100644
--- a/python/graphstorm/gpartition/__init__.py
+++ b/python/graphstorm/gpartition/__init__.py
@@ -19,3 +19,4 @@
 from .metis_partition import (ParMetisPartitionAlgorithm)
 from .partition_config import (ParMETISConfig)
 from .partition_algo_base import LocalPartitionAlgorithm
+from .post_hard_negative import shuffle_hard_negative_nids
\ No newline at end of file
diff --git a/python/graphstorm/gpartition/dist_partition_graph.py b/python/graphstorm/gpartition/dist_partition_graph.py
index da50ce8ca6..8f26def57f 100644
--- a/python/graphstorm/gpartition/dist_partition_graph.py
+++ b/python/graphstorm/gpartition/dist_partition_graph.py
@@ -38,6 +38,7 @@
     ParMetisPartitionAlgorithm,
     ParMETISConfig,
     RandomPartitionAlgorithm,
+    shuffle_hard_negative_nids,
 )
 from graphstorm.utils import get_log_level
 
@@ -189,12 +190,31 @@ def main():
             dirs_exist_ok=True,
         )
 
+    # Hard Negative Mapping
+    if args.gsprocessing_config:
+        gsprocessing_config = args.gsprocessing_config
+        shuffle_hard_negative_nids(f"{args.input_path}/{gsprocessing_config}", args.output_path)
+    else:
+        for filename in os.listdir(args.input_path):
+            if filename.endswith("_with_transformations.json"):
+                gsprocessing_config = filename
+                shuffle_hard_negative_nids(f"{args.input_path}/{gsprocessing_config}",
+                                           args.num_parts, args.output_path)
+                break
+        else:
+            # Do not raise an error here, to avoid introducing a breaking change;
+            # log a message instead so customers know the step was skipped.
+            logging.info("Skip the hard negative node ID mapping, "
+                         "upgrade the latest GSProcessing to solve the warning here.")
+
 def parse_args() -> argparse.Namespace:
     """Parses arguments for the script"""
     argparser = argparse.ArgumentParser("Partition DGL graphs for node and edge classification "
                                         + "or regression tasks")
     argparser.add_argument("--input-path", type=str, required=True,
                            help="Path to input DGL chunked data.")
+    argparser.add_argument("--gsprocessing-config", type=str,
+                           help="Path to the input GSProcessing config data.")
     argparser.add_argument("--metadata-filename", type=str, default="metadata.json",
                            help="Name for the chunked DGL data metadata file.")
     argparser.add_argument("--output-path", type=str, required=True,
@@ -224,4 +244,4 @@ def parse_args() -> argparse.Namespace:
 
 
 if __name__ == '__main__':
-    main()
+    main()
\ No newline at end of file
diff --git a/python/graphstorm/gpartition/post_hard_negative.py b/python/graphstorm/gpartition/post_hard_negative.py
new file mode 100644
index 0000000000..32f4adf634
--- /dev/null
+++ b/python/graphstorm/gpartition/post_hard_negative.py
@@ -0,0 +1,73 @@
+import json
+import os
+
+import torch as th
+from dgl.data.utils import load_tensors, save_tensors
+from graphstorm.model.utils import load_dist_nid_map
+
+def load_hard_negative_config(gsprocessing_config):
+    with open(gsprocessing_config, 'r') as file:
+        config = json.load(file)
+
+    # Hard Negative only supports link prediction
+    edges_config = config['graph']['edges']
+    mapping_edge_list = []
+    for single_edge_config in edges_config:
+        if "features" not in single_edge_config:
+            continue
+        feature_dict = single_edge_config["features"]
+        for single_feature in feature_dict:
+            if single_feature["transformation"]["name"] \
+                    == "edge_dst_hard_negative":
+                edge_type = ":".join([single_edge_config["source"]["type"],
+                                     single_edge_config["relation"]["type"],
+                                     single_edge_config["dest"]["type"]])
+                hard_neg_feat_name = single_feature['name']
+                mapping_edge_list.append({"dst_node_type": single_edge_config["dest"]["type"],
+                                          "edge_type": edge_type,
+                                          "hard_neg_feat_name": hard_neg_feat_name})
+    return mapping_edge_list
+
+
+def shuffle_hard_negative_nids(gsprocessing_config, num_parts, output_path):
+    shuffled_edge_config = load_hard_negative_config(gsprocessing_config)
+
+    node_type_list = []
+    for single_shuffled_edge_config in shuffled_edge_config:
+        node_type = single_shuffled_edge_config["dst_node_type"]
+        node_type_list.append(node_type)
+    node_mapping = load_dist_nid_map(f"{output_path}/dist_graph", node_type_list)
+    gnid2pnid_mapping = {}
+
+    def get_gnid2pnid_map(ntype):
+        if ntype in gnid2pnid_mapping:
+            return gnid2pnid_mapping[ntype]
+        else:
+            pnid2gnid_map = node_mapping[ntype]
+            gnid2pnid_map = th.argsort(pnid2gnid_map)
+            gnid2pnid_mapping[ntype] = gnid2pnid_map
+            # del ntype in node_mapping to save memory
+            del node_mapping[ntype]
+            return gnid2pnid_mapping[ntype]
+
+    # iterate all the partitions to convert hard negative node ids.
+    for i in range(num_parts):
+        part_path = os.path.join(f"{output_path}/dist_graph", f"part{i}")
+        edge_feat_path = os.path.join(part_path, "edge_feat.dgl")
+
+        # load edge features first
+        edge_feats = load_tensors(edge_feat_path)
+        for single_shuffled_edge_config in shuffled_edge_config:
+            etype = single_shuffled_edge_config["edge_type"]
+            neg_feat = single_shuffled_edge_config["hard_neg_feat_name"]
+            neg_ntype = single_shuffled_edge_config["dst_node_type"]
+            efeat_name = f"{etype}/{neg_feat}"
+            hard_nids = edge_feats[efeat_name].long()
+            hard_nid_idx = hard_nids > -1
+            hard_nids[hard_nid_idx] = get_gnid2pnid_map(neg_ntype)[hard_nids[hard_nid_idx]]
+            edge_feats[efeat_name] = hard_nids  # .long() may have returned a copy
+
+        # replace the edge_feat.dgl with the updated one.
+        os.remove(edge_feat_path)
+        save_tensors(edge_feat_path, edge_feats)
+    
diff --git a/python/graphstorm/model/utils.py b/python/graphstorm/model/utils.py
index 0969c95f5d..49fe896787 100644
--- a/python/graphstorm/model/utils.py
+++ b/python/graphstorm/model/utils.py
@@ -393,6 +393,9 @@ def _exchange_node_id_mapping(rank, world_size, device,
     # move mapping into CPU
     return gather_list[0].to(th.device("cpu"))
 
+def load_dist_nid_map(node_id_mapping_file, ntypes):
+    return _load_dist_nid_map(node_id_mapping_file, ntypes)
+
 def _load_dist_nid_map(node_id_mapping_file, ntypes):
     """ Load id mapping files in dist partition format.
     """

From 8def740f2c45aaff6a7617a041784709055b32ed Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Mon, 4 Nov 2024 18:27:43 +0000
Subject: [PATCH 09/50] add doc string

---
 .../gpartition/dist_partition_graph.py        |  3 +-
 .../gpartition/post_hard_negative.py          | 39 ++++++++++++++++++-
 python/graphstorm/model/utils.py              |  2 +
 3 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/python/graphstorm/gpartition/dist_partition_graph.py b/python/graphstorm/gpartition/dist_partition_graph.py
index 8f26def57f..7f7bc46c11 100644
--- a/python/graphstorm/gpartition/dist_partition_graph.py
+++ b/python/graphstorm/gpartition/dist_partition_graph.py
@@ -205,7 +205,8 @@ def main():
             # Did not raise error here for not introducing the break change,
             # but will raise warning here to warn customers.
             logging.info("Skip the hard negative node ID mapping, "
-                         "upgrade the latest GSProcessing to solve the warning here.")
+                         "please upgrade to the latest GSProcessing.")
+
 
 def parse_args() -> argparse.Namespace:
     """Parses arguments for the script"""
diff --git a/python/graphstorm/gpartition/post_hard_negative.py b/python/graphstorm/gpartition/post_hard_negative.py
index 32f4adf634..2c98bbbb95 100644
--- a/python/graphstorm/gpartition/post_hard_negative.py
+++ b/python/graphstorm/gpartition/post_hard_negative.py
@@ -1,3 +1,19 @@
+"""
+    Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+"""
+
 import json
 import os
 
@@ -5,7 +21,15 @@
 from dgl.data.utils import load_tensors, save_tensors
 from graphstorm.model.utils import load_dist_nid_map
 
+
 def load_hard_negative_config(gsprocessing_config):
+    """Load GSProcessing Config to extract hard negative config
+
+    Parameters
+    ----------------
+    gsprocessing_config: str
+        Path to the gsprocessing config.
+    """
     with open(gsprocessing_config, 'r') as file:
         config = json.load(file)
 
@@ -30,6 +54,19 @@ def load_hard_negative_config(gsprocessing_config):
 
 
 def shuffle_hard_negative_nids(gsprocessing_config, num_parts, output_path):
+    """Shuffle hard negative edge feature ids with int-to-int node id mapping.
+    The function here align with the shuffle_hard_nids in graphstorm.gconstruct.utils.
+    Create an additional function to handle the id mappings under distributed setting.
+
+    Parameters
+    ----------------
+    gsprocessing_config: str
+        Path to the gsprocessing config.
+    num_parts: int
+        Number of parts.
+    output_path: str
+        Path to the output DGL graph.
+    """
     shuffled_edge_config = load_hard_negative_config(gsprocessing_config)
 
     node_type_list = []
@@ -70,4 +107,4 @@ def get_gnid2pnid_map(ntype):
         # replace the edge_feat.dgl with the updated one.
         os.remove(edge_feat_path)
         save_tensors(edge_feat_path, edge_feats)
-    
+
diff --git a/python/graphstorm/model/utils.py b/python/graphstorm/model/utils.py
index 49fe896787..3d12ec3458 100644
--- a/python/graphstorm/model/utils.py
+++ b/python/graphstorm/model/utils.py
@@ -394,6 +394,8 @@ def _exchange_node_id_mapping(rank, world_size, device,
     return gather_list[0].to(th.device("cpu"))
 
 def load_dist_nid_map(node_id_mapping_file, ntypes):
+    """ Public wrapper around _load_dist_nid_map.
+    """
     return _load_dist_nid_map(node_id_mapping_file, ntypes)
 
 def _load_dist_nid_map(node_id_mapping_file, ntypes):

From 644ba29930b2dd17f31d54286fd9e6853c544b46 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Mon, 4 Nov 2024 20:02:39 +0000
Subject: [PATCH 10/50] add gsprocessing part test

---
 .../dist_hard_negative_transformation.py      |   2 +-
 .../test_dist_hard_negative_transformation.py | 107 ++++++++++++++++++
 2 files changed, 108 insertions(+), 1 deletion(-)
 create mode 100755 graphstorm-processing/tests/test_dist_hard_negative_transformation.py

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index 2b859112a6..5a7a76e38d 100644
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -65,8 +65,8 @@ def map_values(hard_neg_list):
         return mapped_values
 
     map_values_udf = F.udf(map_values, ArrayType(IntegerType()))
-
     transformed_df = transformed_df.select(map_values_udf(F.col(cols[0])).alias(cols[0]))
+
     return transformed_df
 
 
diff --git a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py
new file mode 100755
index 0000000000..cbc6b83687
--- /dev/null
+++ b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py
@@ -0,0 +1,107 @@
+"""
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License").
+You may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import pytest
+from pyspark.sql import DataFrame, SparkSession
+import numpy as np
+from numpy.testing import assert_array_equal
+
+from graphstorm_processing.constants import NODE_MAPPING_STR, NODE_MAPPING_INT
+from graphstorm_processing.data_transformations.dist_transformations import (
+    DistHardNegativeTransformation,
+)
+
+
+def test_hard_negative_example_list(spark: SparkSession, check_df_schema, tmp_path):
+    # Input Data DataFrame
+    data = [
+        ("mark", "doctor", ["scientist"]),
+        ("john", "scientist", ["engineer", "nurse"]),
+        ("tara", "engineer", ["nurse", "doctor", "scientist"]),
+        ("jen", "nurse", ["doctor"]),
+    ]
+    columns = ["src_type", "dst_type", "hard_negative"]
+    input_df = spark.createDataFrame(data, schema=columns)
+
+    # Mapping DataFrame
+    mapping_data = [
+        ("doctor", 0),
+        ("scientist", 1),
+        ("engineer", 2),
+        ("nurse", 3),
+    ]
+    mapping_column = [NODE_MAPPING_STR, NODE_MAPPING_INT]
+    mapping_df = spark.createDataFrame(mapping_data, schema=mapping_column)
+    mapping_df.repartition(1).write.parquet(f"{tmp_path}/raw_id_mappings/dst_type/parquet")
+    edge_mapping_dict = {
+        "edge_type": "src_type:relation:dst_type",
+        "mapping_path": f"{tmp_path}/raw_id_mappings/",
+        "format_name": "parquet",
+    }
+    hard_negative_transformation = DistHardNegativeTransformation(
+        ["hard_negative"], spark=spark, edge_mapping_dict=edge_mapping_dict, separator=None
+    )
+    output_df = hard_negative_transformation.apply(input_df)
+    check_df_schema(output_df)
+    output_data = output_df.collect()
+
+    expected_output = [[1, -1, -1, -1], [2, 3, -1, -1], [3, 0, 1, -1], [0, -1, -1, -1]]
+
+    for idx, row in enumerate(output_data):
+        np.testing.assert_almost_equal(
+            row[0], expected_output[idx], decimal=3, err_msg=f"Row {idx} is not equal"
+        )
+
+
+def test_hard_negative_example_str(spark: SparkSession, check_df_schema, tmp_path):
+    # Input Data DataFrame
+    data = [
+        ("mark", "doctor", "scientist"),
+        ("john", "scientist", "engineer;nurse"),
+        ("tara", "engineer", "nurse;doctor;scientist"),
+        ("jen", "nurse", "doctor"),
+    ]
+    columns = ["src_type", "dst_type", "hard_negative"]
+    input_df = spark.createDataFrame(data, schema=columns)
+
+    # Mapping DataFrame
+    mapping_data = [
+        ("doctor", 0),
+        ("scientist", 1),
+        ("engineer", 2),
+        ("nurse", 3),
+    ]
+    mapping_column = [NODE_MAPPING_STR, NODE_MAPPING_INT]
+    mapping_df = spark.createDataFrame(mapping_data, schema=mapping_column)
+    mapping_df.repartition(1).write.parquet(f"{tmp_path}/raw_id_mappings/dst_type/parquet")
+    edge_mapping_dict = {
+        "edge_type": "src_type:relation:dst_type",
+        "mapping_path": f"{tmp_path}/raw_id_mappings/",
+        "format_name": "parquet",
+    }
+    hard_negative_transformation = DistHardNegativeTransformation(
+        ["hard_negative"], spark=spark, edge_mapping_dict=edge_mapping_dict, separator=";"
+    )
+    output_df = hard_negative_transformation.apply(input_df)
+    check_df_schema(output_df)
+    output_data = output_df.collect()
+
+    expected_output = [[1, -1, -1, -1], [2, 3, -1, -1], [3, 0, 1, -1], [0, -1, -1, -1]]
+
+    for idx, row in enumerate(output_data):
+        np.testing.assert_almost_equal(
+            row[0], expected_output[idx], decimal=3, err_msg=f"Row {idx} is not equal"
+        )

From 1f085da8be3620223376d36930ac13ad1741a2cf Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Mon, 4 Nov 2024 23:42:26 +0000
Subject: [PATCH 11/50] add test for gspartition part

---
 tests/unit-tests/gpartition/conftest.py       |   7 +-
 .../test_hard_negative_post_partition.py      | 340 ++++++++++++++++++
 2 files changed, 345 insertions(+), 2 deletions(-)
 create mode 100644 tests/unit-tests/gpartition/test_hard_negative_post_partition.py

diff --git a/tests/unit-tests/gpartition/conftest.py b/tests/unit-tests/gpartition/conftest.py
index 6e522e3a5e..9f2770e1c7 100644
--- a/tests/unit-tests/gpartition/conftest.py
+++ b/tests/unit-tests/gpartition/conftest.py
@@ -21,6 +21,8 @@
 import pytest
 
 from graphstorm.gpartition import LocalPartitionAlgorithm
+from graphstorm.gpartition.post_hard_negative import (shuffle_hard_negative_nids,
+                                                      load_hard_negative_config)
 
 @pytest.fixture(scope="module", name="chunked_metadata_dict")
 def metadata_dict_fixture() -> Dict:
@@ -29,6 +31,7 @@ def metadata_dict_fixture() -> Dict:
         "node_type": ["a", "b"],
     }
 
+
 def simple_test_partition(
     partition_algorithm: LocalPartitionAlgorithm,
     algorithm_name: str,
@@ -61,7 +64,7 @@ def simple_test_partition(
         with open(os.path.join(tmpdir, "partition_meta.json"), 'r', encoding="utf-8") as f:
             part_meta = json.load(f)
             assert part_meta["num_parts"] == num_parts
-            assert part_meta["algo_name"] ==  algorithm_name
+            assert part_meta["algo_name"] == algorithm_name
 
         # Ensure contents of partition assignment files are correct
         for i, node_type in enumerate(chunked_metadata_dict["node_type"]):
@@ -70,4 +73,4 @@ def simple_test_partition(
                 assert len(node_partitions) == chunked_metadata_dict["num_nodes_per_type"][i]
                 for part_id in node_partitions:
                     assert part_id.isdigit()
-                    assert int(part_id) < num_parts
+                    assert int(part_id) < num_parts
\ No newline at end of file
diff --git a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py
new file mode 100644
index 0000000000..0c3b32bf05
--- /dev/null
+++ b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py
@@ -0,0 +1,340 @@
+"""
+    Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+"""
+import os
+import json
+import torch as th
+import numpy as np
+from typing import Dict
+
+import pytest
+
+from numpy.testing import assert_almost_equal
+from graphstorm.model.utils import load_dist_nid_map
+from dgl.data.utils import load_tensors, save_tensors
+from graphstorm.gpartition.post_hard_negative import (shuffle_hard_negative_nids,
+                                                      load_hard_negative_config)
+
+@pytest.fixture(scope="module", name="gsprocessing_hard_negative_config")
+def gsprocessing_config_hard_negative_dict_fixture() -> Dict:
+    return{
+        "graph": {
+            "nodes": [
+                {
+                    "data": {
+                        "format": "parquet",
+                        "files": [
+                            "./nodes/author.parquet"
+                        ]
+                    },
+                    "type": "author",
+                    "column": "node_id",
+                },
+                {
+                    "data": {
+                        "format": "parquet",
+                        "files": [
+                            "./nodes/paper.parquet"
+                        ]
+                    },
+                    "type": "paper",
+                    "column": "node_id",
+                    "features": [
+                        {
+                            "column": "feat",
+                            "name": "feat",
+                            "transformation": {
+                                "name": "no-op"
+                            }
+                        }
+                    ],
+                    "labels": [
+                        {
+                            "column": "label",
+                            "type": "classification",
+                            "split_rate": {
+                                "train": 0.8,
+                                "val": 0.1,
+                                "test": 0.1
+                            }
+                        }
+                    ]
+                }
+            ],
+            "edges": [
+                {
+                    "data": {
+                        "format": "parquet",
+                        "files": [
+                            "./edges/author_writing_paper_hard_negative.parquet"
+                        ]
+                    },
+                    "source": {
+                        "column": "source_id",
+                        "type": "author"
+                    },
+                    "dest": {
+                        "column": "dest_id",
+                        "type": "paper"
+                    },
+                    "relation": {
+                        "type": "writing"
+                    },
+                    "features": [
+                        {
+                            "column": "hard_neg",
+                            "name": "hard_neg_feat",
+                            "transformation": {
+                                "name": "edge_dst_hard_negative",
+                                "kwargs": {
+                                    "separator": ";"
+                                }
+                            }
+                        }
+                    ]
+                },
+                {
+                    "data": {
+                        "format": "parquet",
+                        "files": [
+                            "./edges/paper_citing_paper.parquet"
+                        ]
+                    },
+                    "source": {
+                        "column": "source_id",
+                        "type": "paper"
+                    },
+                    "dest": {
+                        "column": "dest_id",
+                        "type": "paper"
+                    },
+                    "relation": {
+                        "type": "citing"
+                    },
+                    "labels": [
+                        {
+                            "column": "",
+                            "type": "link_prediction",
+                            "split_rate": {
+                                "train": 0.8,
+                                "val": 0.1,
+                                "test": 0.1
+                            }
+                        }
+                    ]
+                }
+            ]
+        },
+        "version": "gsprocessing-v1.0"
+    }
+
+
+@pytest.fixture(scope="module", name="gsprocessing_non_hard_negative_config")
+def gsprocessing_config_non_hard_negative_dict_fixture() -> Dict:
+    return{
+        "graph": {
+            "nodes": [
+                {
+                    "data": {
+                        "format": "parquet",
+                        "files": [
+                            "./nodes/author.parquet"
+                        ]
+                    },
+                    "type": "author",
+                    "column": "node_id",
+                },
+                {
+                    "data": {
+                        "format": "parquet",
+                        "files": [
+                            "./nodes/paper.parquet"
+                        ]
+                    },
+                    "type": "paper",
+                    "column": "node_id",
+                    "features": [
+                        {
+                            "column": "feat",
+                            "name": "feat",
+                            "transformation": {
+                                "name": "no-op"
+                            }
+                        }
+                    ],
+                    "labels": [
+                        {
+                            "column": "label",
+                            "type": "classification",
+                            "split_rate": {
+                                "train": 0.8,
+                                "val": 0.1,
+                                "test": 0.1
+                            }
+                        }
+                    ]
+                }
+            ],
+            "edges": [
+                {
+                    "data": {
+                        "format": "parquet",
+                        "files": [
+                            "./edges/author_writing_paper_hard_negative.parquet"
+                        ]
+                    },
+                    "source": {
+                        "column": "source_id",
+                        "type": "author"
+                    },
+                    "dest": {
+                        "column": "dest_id",
+                        "type": "paper"
+                    },
+                    "relation": {
+                        "type": "writing"
+                    }
+                },
+                {
+                    "data": {
+                        "format": "parquet",
+                        "files": [
+                            "./edges/paper_citing_paper.parquet"
+                        ]
+                    },
+                    "source": {
+                        "column": "source_id",
+                        "type": "paper"
+                    },
+                    "dest": {
+                        "column": "dest_id",
+                        "type": "paper"
+                    },
+                    "relation": {
+                        "type": "citing"
+                    },
+                    "labels": [
+                        {
+                            "column": "",
+                            "type": "link_prediction",
+                            "split_rate": {
+                                "train": 0.8,
+                                "val": 0.1,
+                                "test": 0.1
+                            }
+                        }
+                    ]
+                }
+            ]
+        },
+        "version": "gsprocessing-v1.0"
+    }
+
+
+def test_load_hard_negative_config(tmp_path, gsprocessing_hard_negative_config: Dict,
+                                   gsprocessing_non_hard_negative_config: Dict):
+    # For config with gsprocessing_config.json
+    json_file_path = f"{tmp_path}/gsprocessing_config.json"
+
+    # Write the dictionary to the JSON file
+    with open(json_file_path, 'w') as json_file:
+        json.dump(gsprocessing_hard_negative_config, json_file, indent=4)
+
+    res = load_hard_negative_config(json_file_path)
+
+    assert res[0] == {'dst_node_type': 'paper', 'edge_type':
+        'author:writing:paper', 'hard_neg_feat_name': 'hard_neg_feat'}
+
+    # For config without hard negative feature definition
+    json_file_path = f"{tmp_path}/gsprocessing_config.json"
+
+    # Write the dictionary to the JSON file
+    with open(json_file_path, 'w') as json_file:
+        json.dump(gsprocessing_non_hard_negative_config,
+                  json_file, indent=4)
+
+    res = load_hard_negative_config(json_file_path)
+
+    assert res == []
+
+
+def test_shuffle_hard_negative_nids(tmp_path, gsprocessing_hard_negative_config: Dict):
+    # For config with gsprocessing_config.json
+    json_file_path = f"{tmp_path}/gsprocessing_config.json"
+
+    # Write the dictionary to the JSON file
+    with open(json_file_path, 'w') as json_file:
+        json.dump(gsprocessing_hard_negative_config, json_file, indent=4)
+
+    # Generate dgl graph
+    partitioned_graph = f"{tmp_path}/partitioned_graph"
+
+    # Generate ID mapping for each partition
+    nid_map_dict_path0 = os.path.join(partitioned_graph, "dist_graph", "part0", "orig_nids.dgl")
+    nid_map_dict_path1 = os.path.join(partitioned_graph, "dist_graph", "part1", "orig_nids.dgl")
+    os.makedirs(os.path.dirname(nid_map_dict_path0), exist_ok=True)
+    os.makedirs(os.path.dirname(nid_map_dict_path1), exist_ok=True)
+
+    # Use randperm in the test otherwise there maybe no mapping necessary
+    nid_map0 = {
+        "paper": th.randperm(100),
+        "author": th.arange(200, 300)
+    }
+    save_tensors(nid_map_dict_path0, nid_map0)
+
+    nid_map1 = {
+        "paper": th.randperm(100) + 100,
+        "author": th.arange(300, 400)
+    }
+    save_tensors(nid_map_dict_path1, nid_map1)
+
+    # Create reversed map
+    node_mapping = load_dist_nid_map(f"{partitioned_graph}/dist_graph", ["author", "paper"])
+    reverse_map_dst = {gid: i for i, gid in enumerate(node_mapping["paper"].tolist())}
+    reverse_map_dst[-1] = -1
+
+    # generate edge features
+    etype = ("author", "writing", "paper")
+    edge_feat_path0 = os.path.join(partitioned_graph, "dist_graph", "part0", "edge_feat.dgl")
+    edge_feat_path1 = os.path.join(partitioned_graph, "dist_graph", "part1", "edge_feat.dgl")
+    os.makedirs(os.path.dirname(edge_feat_path0), exist_ok=True)
+    os.makedirs(os.path.dirname(edge_feat_path1), exist_ok=True)
+
+    paper_writing_hard_neg0 = th.cat((th.randint(0, 100, (100, 100)),
+                                        th.full((100, 10), -1, dtype=th.int32)), dim=1)
+    paper_writing_hard_neg0_shuffled = [
+        [reverse_map_dst[nid] for nid in negs] \
+        for negs in paper_writing_hard_neg0.tolist()]
+    paper_writing_hard_neg0_shuffled = np.array(paper_writing_hard_neg0_shuffled)
+    paper_writing_hard_neg1 = th.cat((th.randint(100, 200, (100, 100)),
+                                        th.full((100, 10), -1, dtype=th.int32)), dim=1)
+    paper_writing_hard_neg1_shuffled = [
+        [reverse_map_dst[nid] for nid in negs] \
+        for negs in paper_writing_hard_neg1.tolist()]
+    paper_writing_hard_neg1_shuffled = np.array(paper_writing_hard_neg1_shuffled)
+
+    save_tensors(edge_feat_path0, {":".join(etype)+"/hard_neg_feat": paper_writing_hard_neg0})
+    save_tensors(edge_feat_path1, {":".join(etype)+"/hard_neg_feat": paper_writing_hard_neg1})
+
+    # Do the shuffling
+    shuffle_hard_negative_nids(json_file_path, 2, partitioned_graph)
+
+    # Assert
+    paper_writing_hard_neg0 = load_tensors(edge_feat_path0)
+    assert_almost_equal(paper_writing_hard_neg0[":".join(etype) + "/hard_neg_feat"].numpy(),
+                        paper_writing_hard_neg0_shuffled)
+    paper_writing_hard_neg1 = load_tensors(edge_feat_path1)
+    assert_almost_equal(paper_writing_hard_neg1[":".join(etype) + "/hard_neg_feat"].numpy(),
+                        paper_writing_hard_neg1_shuffled)
\ No newline at end of file

From b7bcbaab6d65b91cd7596d44e48abc351af390a9 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Mon, 4 Nov 2024 23:44:03 +0000
Subject: [PATCH 12/50] remove unused post_hard_negative import from gpartition conftest

---
 tests/unit-tests/gpartition/conftest.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/unit-tests/gpartition/conftest.py b/tests/unit-tests/gpartition/conftest.py
index 9f2770e1c7..9d1cfa7dfd 100644
--- a/tests/unit-tests/gpartition/conftest.py
+++ b/tests/unit-tests/gpartition/conftest.py
@@ -21,8 +21,7 @@
 import pytest
 
 from graphstorm.gpartition import LocalPartitionAlgorithm
-from graphstorm.gpartition.post_hard_negative import (shuffle_hard_negative_nids,
-                                                      load_hard_negative_config)
+
 
 @pytest.fixture(scope="module", name="chunked_metadata_dict")
 def metadata_dict_fixture() -> Dict:

From b67836df16e94ea933b98020464a20a61788fda9 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Tue, 5 Nov 2024 00:00:41 +0000
Subject: [PATCH 13/50] lint

---
 .../config/hard_negative_configs.py           |  3 ---
 .../dist_feature_transformer.py               | 11 +++++++---
 .../dist_hard_negative_transformation.py      | 21 ++++++++++++-------
 .../dist_heterogeneous_loader.py              |  6 ++++--
 4 files changed, 26 insertions(+), 15 deletions(-)
 mode change 100644 => 100755 graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py

diff --git a/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py
index 00a99c8f1b..cc732fe80c 100644
--- a/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py
+++ b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py
@@ -33,6 +33,3 @@ def __init__(self, config: Mapping):
         self.separator = self._transformation_kwargs.get("separator", None)
 
         self._sanity_check()
-
-    def _sanity_check(self) -> None:
-        super()._sanity_check()
\ No newline at end of file
diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
index a9c0a14ae5..0b540d32fd 100644
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
@@ -39,7 +39,11 @@ class DistFeatureTransformer(object):
     """
 
     def __init__(
-        self, feature_config: FeatureConfig, spark: SparkSession, json_representation: dict, edge_mapping_dict: dict = None
+        self,
+        feature_config: FeatureConfig,
+        spark: SparkSession,
+        json_representation: dict,
+        edge_mapping_dict: dict = None,
     ):
         feat_type = feature_config.feat_type
         feat_name = feature_config.feat_name
@@ -73,8 +77,9 @@ def __init__(
         elif feat_type == "huggingface":
             self.transformation = DistHFTransformation(**default_kwargs, **args_dict)
         elif feat_type == "edge_dst_hard_negative":
-            self.transformation = DistHardNegativeTransformation(**default_kwargs, **args_dict,
-                                                                 spark=spark, edge_mapping_dict=edge_mapping_dict)
+            self.transformation = DistHardNegativeTransformation(
+                **default_kwargs, **args_dict, spark=spark, edge_mapping_dict=edge_mapping_dict
+            )
         else:
             raise NotImplementedError(
                 f"Feature {feat_name} has type: {feat_type} that is not supported"
diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
old mode 100644
new mode 100755
index 5a7a76e38d..046e5adfe1
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -14,19 +14,22 @@
 limitations under the License.
 """
 
-import logging
-import os
 from typing import Sequence
-from pyspark.sql.functions import udf, split, col
+from pyspark.sql.functions import split, col
 from pyspark.sql.types import ArrayType, IntegerType, StringType
 from pyspark.sql import DataFrame, functions as F, SparkSession
 
+from graphstorm_processing.constants import NODE_MAPPING_STR, NODE_MAPPING_INT
+
 from .base_dist_transformation import DistributedTransformation
 
-from graphstorm_processing.constants import NODE_MAPPING_STR, NODE_MAPPING_INT
 
 def apply_transform(
-    cols: Sequence[str], separator: str, spark: SparkSession, input_df: DataFrame, edge_mapping_dict: dict
+    cols: Sequence[str],
+    separator: str,
+    spark: SparkSession,
+    input_df: DataFrame,
+    edge_mapping_dict: dict,
 ) -> DataFrame:
     """Applies hard negative transformation to each row.
 
@@ -51,11 +54,15 @@ def apply_transform(
     _, _, dst_type = edge_mapping_dict["edge_type"].split(":")
     mapping_prefix = edge_mapping_dict["mapping_path"]
     format_name = edge_mapping_dict["format_name"]
-    hard_negative_node_mapping = spark.read.parquet(f"{mapping_prefix}{dst_type}/{format_name}/*.parquet")
+    hard_negative_node_mapping = spark.read.parquet(
+        f"{mapping_prefix}{dst_type}/{format_name}/*.parquet"
+    )
     node_mapping_length = hard_negative_node_mapping.count()
 
     # TODO: This method may suffer from scalability issue, we can make this method to join-based solution.
-    hard_negative_node_mapping_dict = {row[NODE_MAPPING_STR]: row[NODE_MAPPING_INT] for row in hard_negative_node_mapping.collect()}
+    hard_negative_node_mapping_dict = {
+        row[NODE_MAPPING_STR]: row[NODE_MAPPING_INT] for row in hard_negative_node_mapping.collect()
+    }
 
     # Same length for feature to convert to tensor
     def map_values(hard_neg_list):
diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py
index ef3d5c00c2..476688776d 100644
--- a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py
+++ b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py
@@ -1658,9 +1658,11 @@ def _process_edge_features(
             edge_mapping_dict = {
                 "edge_type": edge_type,
                 "mapping_path": f"{self.output_prefix}/raw_id_mappings/",
-                "format_name": FORMAT_NAME
+                "format_name": FORMAT_NAME,
             }
-            transformer = DistFeatureTransformer(feat_conf, self.spark, json_representation, edge_mapping_dict)
+            transformer = DistFeatureTransformer(
+                feat_conf, self.spark, json_representation, edge_mapping_dict
+            )
 
             if json_representation:
                 logging.info(

From 10d29fb10d9955beece698ceafda4a58e604d853 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Tue, 5 Nov 2024 00:08:57 +0000
Subject: [PATCH 14/50] lint

---
 python/graphstorm/gpartition/dist_partition_graph.py | 5 +++--
 python/graphstorm/gpartition/post_hard_negative.py   | 3 +--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/graphstorm/gpartition/dist_partition_graph.py b/python/graphstorm/gpartition/dist_partition_graph.py
index 7f7bc46c11..6a4e938b16 100644
--- a/python/graphstorm/gpartition/dist_partition_graph.py
+++ b/python/graphstorm/gpartition/dist_partition_graph.py
@@ -193,7 +193,8 @@ def main():
     # Hard Negative Mapping
     if args.gsprocessing_config:
         gsprocessing_config = args.gsprocessing_config
-        shuffle_hard_negative_nids(f"{args.input_path}/{gsprocessing_config}", args.output_path)
+        shuffle_hard_negative_nids(f"{args.input_path}/{gsprocessing_config}",
+                                   args.num_parts, args.output_path)
     else:
         for filename in os.listdir(args.input_path):
             if filename.endswith("_with_transformations.json"):
@@ -245,4 +246,4 @@ def parse_args() -> argparse.Namespace:
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
diff --git a/python/graphstorm/gpartition/post_hard_negative.py b/python/graphstorm/gpartition/post_hard_negative.py
index 2c98bbbb95..be77cead5e 100644
--- a/python/graphstorm/gpartition/post_hard_negative.py
+++ b/python/graphstorm/gpartition/post_hard_negative.py
@@ -30,7 +30,7 @@ def load_hard_negative_config(gsprocessing_config):
     gsprocessing_config: str
         Path to the gsprocessing config.
     """
-    with open(gsprocessing_config, 'r') as file:
+    with open(gsprocessing_config, 'r', encoding='utf-8') as file:
         config = json.load(file)
 
     # Hard Negative only supports link prediction
@@ -107,4 +107,3 @@ def get_gnid2pnid_map(ntype):
         # replace the edge_feat.dgl with the updated one.
         os.remove(edge_feat_path)
         save_tensors(edge_feat_path, edge_feats)
-

From 3fea39b9c0915aa53b4d97a69b00bd1678051c84 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Tue, 5 Nov 2024 00:12:38 +0000
Subject: [PATCH 15/50] change

---
 .../dist_transformations/dist_hard_negative_transformation.py  | 3 ++-
 python/graphstorm/gpartition/__init__.py                       | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index 046e5adfe1..b86505b2eb 100755
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -59,7 +59,8 @@ def apply_transform(
     )
     node_mapping_length = hard_negative_node_mapping.count()
 
-    # TODO: This method may suffer from scalability issue, we can make this method to join-based solution.
+    # TODO: This method may suffer from scalability issue,
+    # we can make this method to join-based solution.
     hard_negative_node_mapping_dict = {
         row[NODE_MAPPING_STR]: row[NODE_MAPPING_INT] for row in hard_negative_node_mapping.collect()
     }
diff --git a/python/graphstorm/gpartition/__init__.py b/python/graphstorm/gpartition/__init__.py
index 818d6ca79a..b66664f68f 100644
--- a/python/graphstorm/gpartition/__init__.py
+++ b/python/graphstorm/gpartition/__init__.py
@@ -19,4 +19,4 @@
 from .metis_partition import (ParMetisPartitionAlgorithm)
 from .partition_config import (ParMETISConfig)
 from .partition_algo_base import LocalPartitionAlgorithm
-from .post_hard_negative import shuffle_hard_negative_nids
\ No newline at end of file
+from .post_hard_negative import shuffle_hard_negative_nids

From 4ad0e97374f214d4e7f134f87d474941a1aff08f Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Tue, 5 Nov 2024 00:22:18 +0000
Subject: [PATCH 16/50] black lint

---
 graphstorm-processing/graphstorm_processing/constants.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/graphstorm-processing/graphstorm_processing/constants.py b/graphstorm-processing/graphstorm_processing/constants.py
index fc27b17686..06aa3b3d36 100644
--- a/graphstorm-processing/graphstorm_processing/constants.py
+++ b/graphstorm-processing/graphstorm_processing/constants.py
@@ -62,6 +62,7 @@
 NODE_MAPPING_STR = "orig"
 NODE_MAPPING_INT = "new"
 
+
 ################# Supported execution envs  ##############
 class ExecutionEnv(Enum):
     """Supported execution environments"""

From a4dfb617c008bd8345e58c690880a07e578e890c Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Tue, 5 Nov 2024 18:18:54 +0000
Subject: [PATCH 17/50] add doc to hard negative

---
 docs/source/advanced/link-prediction.rst                   | 6 ++++--
 .../distributed/gsprocessing/input-configuration.rst       | 7 +++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/docs/source/advanced/link-prediction.rst b/docs/source/advanced/link-prediction.rst
index 10d7450cb6..9bf68c34ad 100644
--- a/docs/source/advanced/link-prediction.rst
+++ b/docs/source/advanced/link-prediction.rst
@@ -236,6 +236,8 @@ impact is negligible.
 
 With DGL 1.0.4, ``fast_localuniform`` dataloader can speedup 2.4X over ``localuniform`` dataloader on training a 2 layer RGCN on MAG dataset on four g5.48x instances.
 
+.. _hard_negative_sampling:
+
 Hard Negative sampling
 -----------------------
 GraphStorm provides support for users to define hard negative edges for a positive edge during Link Prediction training.
@@ -271,10 +273,10 @@ In general, GraphStorm covers following cases:
 
 **Preparing graph data for hard negative sampling**
 
-The gconstruct pipeline of GraphStorm provides support to load hard negative data from raw input.
+Both the single-machine and distributed graph construction pipelines of GraphStorm support loading hard negative data from raw input.
 Hard destination negatives can be defined through ``edge_dst_hard_negative`` transformation.
 The ``feature_col`` field of ``edge_dst_hard_negative`` must stores the raw node ids of hard destination nodes.
-The follwing example shows how to define a hard negative feature for edges with the relation ``(node1, relation1, node1)``:
+The following example shows how to define a hard negative feature for edges with the relation ``(node1, relation1, node1)``:
 
   .. code-block:: json
 
diff --git a/docs/source/cli/graph-construction/distributed/gsprocessing/input-configuration.rst b/docs/source/cli/graph-construction/distributed/gsprocessing/input-configuration.rst
index f44b22d47e..ca5e85fdf5 100644
--- a/docs/source/cli/graph-construction/distributed/gsprocessing/input-configuration.rst
+++ b/docs/source/cli/graph-construction/distributed/gsprocessing/input-configuration.rst
@@ -491,6 +491,13 @@ arguments.
         You can use a length greater than the dataset's longest sentence; or for a safe value choose 128. Make sure to check
         the model's max supported length when setting this value.
 
+-  ``edge_dst_hard_negative``
+
+   -  Encodes a hard negative edge feature for link prediction. For detailed information on hard negative support, please refer to :ref:`hard_negative_sampling`.
+   -  ``kwargs``:
+      - ``separator`` (String, optional): Same as in the No-op operation: the separator used to
+        split multiple input values in CSV files, e.g. ``p0;s1``. If it is not provided, the whole value
+        is treated as a single string.
 
 ..  _gsprocessing-multitask-ref:
 

From 8dfdf050dbfe0e80bdcc5396ae9a53edc9e74262 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Tue, 5 Nov 2024 18:29:05 +0000
Subject: [PATCH 18/50] add doc

---
 .../dist_transformations/dist_hard_negative_transformation.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index b86505b2eb..ad7e26987c 100755
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -39,6 +39,8 @@ def apply_transform(
         List of column names to apply normalization to.
     separator: str, optional
         The separator for string input value. Only required when input value type is string.
+    spark: SparkSession
+        The spark session
     input_df : DataFrame
         The input DataFrame to apply normalization to.
     edge_mapping_dict: dict
@@ -49,7 +51,7 @@ def apply_transform(
         transformed_df = input_df.withColumn(cols[0], split(col(cols[0]), separator))
     else:
         transformed_df = input_df
-    # Edge type should be (src_ntype:get_relation_name()}:dst_ntype)
+    # Edge type should be (src_ntype:get_relation_name():dst_ntype)
     # Assume all the node type in the hard negative feature should be dst node type
     _, _, dst_type = edge_mapping_dict["edge_type"].split(":")
     mapping_prefix = edge_mapping_dict["mapping_path"]

From 52bef41a9205abc3cb3f8b550a4c004c72839c41 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Tue, 5 Nov 2024 18:31:18 +0000
Subject: [PATCH 19/50] reset test

---
 tests/unit-tests/gpartition/conftest.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/unit-tests/gpartition/conftest.py b/tests/unit-tests/gpartition/conftest.py
index 9d1cfa7dfd..1e918698d7 100644
--- a/tests/unit-tests/gpartition/conftest.py
+++ b/tests/unit-tests/gpartition/conftest.py
@@ -22,7 +22,6 @@
 
 from graphstorm.gpartition import LocalPartitionAlgorithm
 
-
 @pytest.fixture(scope="module", name="chunked_metadata_dict")
 def metadata_dict_fixture() -> Dict:
     return {
@@ -30,7 +29,6 @@ def metadata_dict_fixture() -> Dict:
         "node_type": ["a", "b"],
     }
 
-
 def simple_test_partition(
     partition_algorithm: LocalPartitionAlgorithm,
     algorithm_name: str,
@@ -63,7 +61,7 @@ def simple_test_partition(
         with open(os.path.join(tmpdir, "partition_meta.json"), 'r', encoding="utf-8") as f:
             part_meta = json.load(f)
             assert part_meta["num_parts"] == num_parts
-            assert part_meta["algo_name"] == algorithm_name
+            assert part_meta["algo_name"] ==  algorithm_name
 
         # Ensure contents of partition assignment files are correct
         for i, node_type in enumerate(chunked_metadata_dict["node_type"]):

From 4f30c7dac036bee2a8370351db0738c27776c819 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Tue, 5 Nov 2024 18:32:03 +0000
Subject: [PATCH 20/50] add test

---
 tests/unit-tests/gpartition/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit-tests/gpartition/conftest.py b/tests/unit-tests/gpartition/conftest.py
index 1e918698d7..6e522e3a5e 100644
--- a/tests/unit-tests/gpartition/conftest.py
+++ b/tests/unit-tests/gpartition/conftest.py
@@ -70,4 +70,4 @@ def simple_test_partition(
                 assert len(node_partitions) == chunked_metadata_dict["num_nodes_per_type"][i]
                 for part_id in node_partitions:
                     assert part_id.isdigit()
-                    assert int(part_id) < num_parts
\ No newline at end of file
+                    assert int(part_id) < num_parts

From 308f833b388f6ed3768ee0230aa198d871590295 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Tue, 5 Nov 2024 18:33:29 +0000
Subject: [PATCH 21/50] simplify test

---
 .../test_hard_negative_post_partition.py      | 70 ++-----------------
 1 file changed, 4 insertions(+), 66 deletions(-)

diff --git a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py
index 0c3b32bf05..1d65ce7d86 100644
--- a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py
+++ b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py
@@ -50,27 +50,7 @@ def gsprocessing_config_hard_negative_dict_fixture() -> Dict:
                         ]
                     },
                     "type": "paper",
-                    "column": "node_id",
-                    "features": [
-                        {
-                            "column": "feat",
-                            "name": "feat",
-                            "transformation": {
-                                "name": "no-op"
-                            }
-                        }
-                    ],
-                    "labels": [
-                        {
-                            "column": "label",
-                            "type": "classification",
-                            "split_rate": {
-                                "train": 0.8,
-                                "val": 0.1,
-                                "test": 0.1
-                            }
-                        }
-                    ]
+                    "column": "node_id"
                 }
             ],
             "edges": [
@@ -122,18 +102,7 @@ def gsprocessing_config_hard_negative_dict_fixture() -> Dict:
                     },
                     "relation": {
                         "type": "citing"
-                    },
-                    "labels": [
-                        {
-                            "column": "",
-                            "type": "link_prediction",
-                            "split_rate": {
-                                "train": 0.8,
-                                "val": 0.1,
-                                "test": 0.1
-                            }
-                        }
-                    ]
+                    }
                 }
             ]
         },
@@ -164,27 +133,7 @@ def gsprocessing_config_non_hard_negative_dict_fixture() -> Dict:
                         ]
                     },
                     "type": "paper",
-                    "column": "node_id",
-                    "features": [
-                        {
-                            "column": "feat",
-                            "name": "feat",
-                            "transformation": {
-                                "name": "no-op"
-                            }
-                        }
-                    ],
-                    "labels": [
-                        {
-                            "column": "label",
-                            "type": "classification",
-                            "split_rate": {
-                                "train": 0.8,
-                                "val": 0.1,
-                                "test": 0.1
-                            }
-                        }
-                    ]
+                    "column": "node_id"
                 }
             ],
             "edges": [
@@ -224,18 +173,7 @@ def gsprocessing_config_non_hard_negative_dict_fixture() -> Dict:
                     },
                     "relation": {
                         "type": "citing"
-                    },
-                    "labels": [
-                        {
-                            "column": "",
-                            "type": "link_prediction",
-                            "split_rate": {
-                                "train": 0.8,
-                                "val": 0.1,
-                                "test": 0.1
-                            }
-                        }
-                    ]
+                    }
                 }
             ]
         },

From b6876d9e200c75aef55145ba5894c902a114f68e Mon Sep 17 00:00:00 2001
From: jalencato <jalencato23pistons@gmail.com>
Date: Tue, 5 Nov 2024 13:22:22 -0800
Subject: [PATCH 22/50] Update gconstruct_converter.py

---
 .../config/config_conversion/gconstruct_converter.py             | 1 -
 1 file changed, 1 deletion(-)

diff --git a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py
index ce615e7a6a..33fe40f760 100644
--- a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py
+++ b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py
@@ -189,7 +189,6 @@ def _convert_feature(feats: list[Mapping[str, Any]]) -> list[dict]:
                         "max_seq_length": gconstruct_transform_dict["max_seq_length"],
                     }
                 elif gconstruct_transform_dict["name"] == "edge_dst_hard_negative":
-                    # Not check if it is link prediction task here
                     gsp_transformation_dict["name"] = "edge_dst_hard_negative"
                     if "separator" in gconstruct_transform_dict:
                         gsp_transformation_dict["kwargs"] = {

From bf1b0eb967fce7d3fc119b946f46f931b3bd1061 Mon Sep 17 00:00:00 2001
From: jalencato <jalencato23pistons@gmail.com>
Date: Tue, 5 Nov 2024 13:22:54 -0800
Subject: [PATCH 23/50] Update hard_negative_configs.py

---
 .../graphstorm_processing/config/hard_negative_configs.py       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py
index cc732fe80c..42d63f9d40 100644
--- a/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py
+++ b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py
@@ -20,7 +20,7 @@
 
 
 class HardNegativeConfig(FeatureConfig):
-    """Feature configuration for hard negative feature. Now only support link prediction
+    """Feature configuration for hard negative feature. Now only support link prediction.
 
     Supported kwargs
     ----------------

From 6d8ed9644f5a1b3540ec59a4506dcd2436cc0b31 Mon Sep 17 00:00:00 2001
From: jalencato <jalencato23pistons@gmail.com>
Date: Tue, 5 Nov 2024 13:24:46 -0800
Subject: [PATCH 24/50] Update dist_feature_transformer.py

---
 .../data_transformations/dist_feature_transformer.py            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
index 0b540d32fd..5e5c936c7d 100644
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
@@ -51,7 +51,7 @@ def __init__(
         self.transformation: DistributedTransformation
         # We use this to re-apply transformations
         self.json_representation = json_representation
-        # Edge mapping file location
+        # Node Mapping Info for hard negative feature transformation
         self.edge_mapping_dict = edge_mapping_dict
 
         default_kwargs = {

From 6eef513d494f04fd81cc0cdc12a953ad30054325 Mon Sep 17 00:00:00 2001
From: jalencato <jalencato23pistons@gmail.com>
Date: Tue, 5 Nov 2024 13:25:41 -0800
Subject: [PATCH 25/50] Update dist_hard_negative_transformation.py

---
 .../dist_transformations/dist_hard_negative_transformation.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index ad7e26987c..083bcb1924 100755
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -42,7 +42,7 @@ def apply_transform(
     spark: SparkSession
         The spark session
     input_df : DataFrame
-        The input DataFrame to apply normalization to.
+        The input DataFrame to apply transformation to.
     edge_mapping_dict: dict
         The mapping dictionary contain mapping file directory and edge type
     """

From 7079f2cd6513806f58be66c185114c8548ac3d78 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Tue, 5 Nov 2024 21:27:19 +0000
Subject: [PATCH 26/50] add feature transformation

---
 .../dist_hard_negative_transformation.py                  | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index 083bcb1924..1bcb1f1465 100755
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -36,13 +36,13 @@ def apply_transform(
     Parameters
     ----------
     cols : Sequence[str]
-        List of column names to apply normalization to.
+        List of column names to apply normalization to
     separator: str, optional
-        The separator for string input value. Only required when input value type is string.
+        The separator for string input value. Only required when input value type is string
     spark: SparkSession
         The spark session
     input_df : DataFrame
-        The input DataFrame to apply transformation to.
+        The input DataFrame to apply transformation to
     edge_mapping_dict: dict
         The mapping dictionary contain mapping file directory and edge type
     """
@@ -86,7 +86,7 @@ class DistHardNegativeTransformation(DistributedTransformation):
     Parameters
     ----------
     separator: str, optional
-        The separator for string input value. Only required when input value type is string.
+        The separator for string input value. Only required when input value type is string
     spark: SparkSession
         The spark session
     edge_mapping_dict: dict

From 8c3c79ef06e11c5b411fc51c258f94f9bba51b34 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Wed, 6 Nov 2024 00:10:10 +0000
Subject: [PATCH 27/50] add dot

---
 .../dist_hard_negative_transformation.py         | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index 1bcb1f1465..94877f065a 100755
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -36,15 +36,15 @@ def apply_transform(
     Parameters
     ----------
     cols : Sequence[str]
-        List of column names to apply normalization to
+        List of column names to apply normalization to.
     separator: str, optional
-        The separator for string input value. Only required when input value type is string
+        The separator for string input value. Only required when input value type is string.
     spark: SparkSession
-        The spark session
+        The spark session.
     input_df : DataFrame
-        The input DataFrame to apply transformation to
+        The input DataFrame to apply transformation to.
     edge_mapping_dict: dict
-        The mapping dictionary contain mapping file directory and edge type
+        The mapping dictionary contain mapping file directory and edge type.
     """
     column_type = input_df.schema[cols[0]].dataType
     if isinstance(column_type, StringType):
@@ -86,11 +86,11 @@ class DistHardNegativeTransformation(DistributedTransformation):
     Parameters
     ----------
     separator: str, optional
-        The separator for string input value. Only required when input value type is string
+        The separator for string input value. Only required when input value type is string.
     spark: SparkSession
-        The spark session
+        The spark session.
     edge_mapping_dict: dict
-        The node type and mapping directory
+        The node type and mapping directory.
     """
 
     def __init__(

From a9dd308dcb622f573a002764fb953b2bf497345c Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Wed, 6 Nov 2024 00:20:47 +0000
Subject: [PATCH 28/50] hard negative config renaming

---
 .../graphstorm_processing/config/config_parser.py           | 4 ++--
 .../graphstorm_processing/config/hard_negative_configs.py   | 2 +-
 .../data_transformations/dist_feature_transformer.py        | 4 ++--
 .../data_transformations/dist_transformations/__init__.py   | 2 +-
 .../dist_hard_negative_transformation.py                    | 4 ++--
 .../tests/test_dist_hard_negative_transformation.py         | 6 +++---
 6 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/graphstorm-processing/graphstorm_processing/config/config_parser.py b/graphstorm-processing/graphstorm_processing/config/config_parser.py
index 15f323ced8..95f4ab3dd2 100644
--- a/graphstorm-processing/graphstorm_processing/config/config_parser.py
+++ b/graphstorm-processing/graphstorm_processing/config/config_parser.py
@@ -29,7 +29,7 @@
 )
 from .categorical_configs import MultiCategoricalFeatureConfig
 from .hf_configs import HFConfig
-from .hard_negative_configs import HardNegativeConfig
+from .hard_negative_configs import HardEdgeNegativeConfig
 from .data_config_base import DataStorageConfig
 
 
@@ -73,7 +73,7 @@ def parse_feat_config(feature_dict: Dict) -> FeatureConfig:
     elif transformation_name == "huggingface":
         return HFConfig(feature_dict)
     elif transformation_name == "edge_dst_hard_negative":
-        return HardNegativeConfig(feature_dict)
+        return HardEdgeNegativeConfig(feature_dict)
     else:
         raise RuntimeError(f"Unknown transformation name: '{transformation_name}'")
 
diff --git a/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py
index 42d63f9d40..26e99a00a6 100644
--- a/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py
+++ b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py
@@ -19,7 +19,7 @@
 from .feature_config_base import FeatureConfig
 
 
-class HardNegativeConfig(FeatureConfig):
+class HardEdgeNegativeConfig(FeatureConfig):
     """Feature configuration for hard negative feature. Now only support link prediction.
 
     Supported kwargs
diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
index 5e5c936c7d..f30a17b383 100644
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
@@ -28,7 +28,7 @@
     DistCategoryTransformation,
     DistMultiCategoryTransformation,
     DistHFTransformation,
-    DistHardNegativeTransformation,
+    DistHardEdgeNegativeTransformation,
 )
 
 
@@ -77,7 +77,7 @@ def __init__(
         elif feat_type == "huggingface":
             self.transformation = DistHFTransformation(**default_kwargs, **args_dict)
         elif feat_type == "edge_dst_hard_negative":
-            self.transformation = DistHardNegativeTransformation(
+            self.transformation = DistHardEdgeNegativeTransformation(
                 **default_kwargs, **args_dict, spark=spark, edge_mapping_dict=edge_mapping_dict
             )
         else:
diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/__init__.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/__init__.py
index 5c74d4928a..959124644b 100644
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/__init__.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/__init__.py
@@ -15,4 +15,4 @@
 )
 from .dist_bucket_numerical_transformation import DistBucketNumericalTransformation
 from .dist_hf_transformation import DistHFTransformation
-from .dist_hard_negative_transformation import DistHardNegativeTransformation
+from .dist_hard_negative_transformation import DistHardEdgeNegativeTransformation
diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index 94877f065a..82a0816bde 100755
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -80,7 +80,7 @@ def map_values(hard_neg_list):
     return transformed_df
 
 
-class DistHardNegativeTransformation(DistributedTransformation):
+class DistHardEdgeNegativeTransformation(DistributedTransformation):
     """Transformation to apply hard negative transformation.
 
     Parameters
@@ -113,4 +113,4 @@ def apply(self, input_df: DataFrame) -> DataFrame:
 
     @staticmethod
     def get_transformation_name() -> str:
-        return "DistHardNegativeTransformation"
+        return "DistHardEdgeNegativeTransformation"
diff --git a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py
index cbc6b83687..0814384164 100755
--- a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py
+++ b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py
@@ -21,7 +21,7 @@
 
 from graphstorm_processing.constants import NODE_MAPPING_STR, NODE_MAPPING_INT
 from graphstorm_processing.data_transformations.dist_transformations import (
-    DistHardNegativeTransformation,
+    DistHardEdgeNegativeTransformation,
 )
 
 
@@ -51,7 +51,7 @@ def test_hard_negative_example_list(spark: SparkSession, check_df_schema, tmp_pa
         "mapping_path": f"{tmp_path}/raw_id_mappings/",
         "format_name": "parquet",
     }
-    hard_negative_transformation = DistHardNegativeTransformation(
+    hard_negative_transformation = DistHardEdgeNegativeTransformation(
         ["hard_negative"], spark=spark, edge_mapping_dict=edge_mapping_dict, separator=None
     )
     output_df = hard_negative_transformation.apply(input_df)
@@ -92,7 +92,7 @@ def test_hard_negative_example_str(spark: SparkSession, check_df_schema, tmp_pat
         "mapping_path": f"{tmp_path}/raw_id_mappings/",
         "format_name": "parquet",
     }
-    hard_negative_transformation = DistHardNegativeTransformation(
+    hard_negative_transformation = DistHardEdgeNegativeTransformation(
         ["hard_negative"], spark=spark, edge_mapping_dict=edge_mapping_dict, separator=";"
     )
     output_df = hard_negative_transformation.apply(input_df)

From cd319fa7e56cfd5a1928a601c935dfe84efe95c2 Mon Sep 17 00:00:00 2001
From: jalencato <jalencato23pistons@gmail.com>
Date: Tue, 5 Nov 2024 16:47:48 -0800
Subject: [PATCH 29/50] Update constants.py

---
 graphstorm-processing/graphstorm_processing/constants.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/graphstorm-processing/graphstorm_processing/constants.py b/graphstorm-processing/graphstorm_processing/constants.py
index 06aa3b3d36..fc27b17686 100644
--- a/graphstorm-processing/graphstorm_processing/constants.py
+++ b/graphstorm-processing/graphstorm_processing/constants.py
@@ -62,7 +62,6 @@
 NODE_MAPPING_STR = "orig"
 NODE_MAPPING_INT = "new"
 
-
 ################# Supported execution envs  ##############
 class ExecutionEnv(Enum):
     """Supported execution environments"""

From 4f25cb9a72a8129f9c173372cf070dcb909cb57b Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Wed, 6 Nov 2024 01:00:36 +0000
Subject: [PATCH 30/50] add constant

---
 graphstorm-processing/graphstorm_processing/constants.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/graphstorm-processing/graphstorm_processing/constants.py b/graphstorm-processing/graphstorm_processing/constants.py
index fc27b17686..06aa3b3d36 100644
--- a/graphstorm-processing/graphstorm_processing/constants.py
+++ b/graphstorm-processing/graphstorm_processing/constants.py
@@ -62,6 +62,7 @@
 NODE_MAPPING_STR = "orig"
 NODE_MAPPING_INT = "new"
 
+
 ################# Supported execution envs  ##############
 class ExecutionEnv(Enum):
     """Supported execution environments"""

From ff7c470d211ab109797d1423211bb7b172137d11 Mon Sep 17 00:00:00 2001
From: jalencato <jalencato23pistons@gmail.com>
Date: Mon, 11 Nov 2024 09:51:32 -0800
Subject: [PATCH 31/50] Apply suggestions from code review

Co-authored-by: xiang song(charlie.song) <classicxsong@gmail.com>
---
 docs/source/advanced/link-prediction.rst                    | 2 +-
 .../distributed/gsprocessing/input-configuration.rst        | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/advanced/link-prediction.rst b/docs/source/advanced/link-prediction.rst
index 3a68684e7e..f28f0ac542 100644
--- a/docs/source/advanced/link-prediction.rst
+++ b/docs/source/advanced/link-prediction.rst
@@ -274,7 +274,7 @@ In general, GraphStorm covers following cases:
 Preparing graph data for hard negative sampling
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Now both single machine and distributed graph construction pipeline of GraphStorm provide support to load hard negative data from raw input.
+Both single machine and distributed graph construction pipeline of GraphStorm provide support to load hard negative data from raw input.
 Hard destination negatives can be defined through ``edge_dst_hard_negative`` transformation.
 The ``feature_col`` field of ``edge_dst_hard_negative`` must stores the raw node ids of hard destination nodes.
 The following example shows how to define a hard negative feature for edges with the relation ``(node1, relation1, node1)``:
diff --git a/docs/source/cli/graph-construction/distributed/gsprocessing/input-configuration.rst b/docs/source/cli/graph-construction/distributed/gsprocessing/input-configuration.rst
index ca5e85fdf5..d2074a338f 100644
--- a/docs/source/cli/graph-construction/distributed/gsprocessing/input-configuration.rst
+++ b/docs/source/cli/graph-construction/distributed/gsprocessing/input-configuration.rst
@@ -495,9 +495,9 @@ arguments.
 
    -  Encodes a hard negative edge feature for link prediction. For detail information for hard negative support, please refer to :ref:`hard_negative_sampling`.
    -  ``kwargs``:
-      - ``separator`` (String, optional): Same as the one in the No-op operation, the separator is used to
-        split multiple input values for CSV files e.g. ``p0;s1``. If it is not provided, then the whole value
-        will be considered as a string.
+      - ``separator`` (String, optional): The separator is used to
+        split multiple values in an input string for data in CSV files e.g. ``p0;s1``. If it is not provided, then the whole value
+        will be treated as a single string.
 
 ..  _gsprocessing-multitask-ref:
 

From dbb8c0fe63b8705f911a9e73d695333babe20e38 Mon Sep 17 00:00:00 2001
From: jalencato <jalencato23pistons@gmail.com>
Date: Mon, 11 Nov 2024 09:55:14 -0800
Subject: [PATCH 32/50] Apply suggestions from code review

Co-authored-by: xiang song(charlie.song) <classicxsong@gmail.com>
---
 .../dist_hard_negative_transformation.py                     | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index 82a0816bde..3bc2ec1868 100755
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -51,8 +51,9 @@ def apply_transform(
         transformed_df = input_df.withColumn(cols[0], split(col(cols[0]), separator))
     else:
         transformed_df = input_df
-    # Edge type should be (src_ntype:get_relation_name():dst_ntype)
-    # Assume all the node type in the hard negative feature should be dst node type
+    # Edge type should be (src_ntype:relation_type:dst_ntype)
+    # Only support hard negative for destination nodes. Get the node type of destination nodes.
+    # TODO: support hard negative for source nodes.
     _, _, dst_type = edge_mapping_dict["edge_type"].split(":")
     mapping_prefix = edge_mapping_dict["mapping_path"]
     format_name = edge_mapping_dict["format_name"]

From ad1c7864a3af36476236cf0e3d36babf37a09cc8 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Mon, 11 Nov 2024 19:48:29 +0000
Subject: [PATCH 33/50] apply comment for GSProcessing

---
 .../dist_feature_transformer.py               |  2 +-
 .../dist_hard_negative_transformation.py      | 24 +++++++++++--------
 .../dist_heterogeneous_loader.py              | 17 ++++++-------
 graphstorm-processing/tests/test_converter.py |  4 ++--
 .../test_dist_hard_negative_transformation.py | 15 ++++++++----
 5 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
index f30a17b383..ec11de7ff5 100644
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
@@ -78,7 +78,7 @@ def __init__(
             self.transformation = DistHFTransformation(**default_kwargs, **args_dict)
         elif feat_type == "edge_dst_hard_negative":
             self.transformation = DistHardEdgeNegativeTransformation(
-                **default_kwargs, **args_dict, spark=spark, edge_mapping_dict=edge_mapping_dict
+                **default_kwargs, **args_dict, spark=spark
             )
         else:
             raise NotImplementedError(
diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index 3bc2ec1868..6ba99f9cd6 100755
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -29,7 +29,7 @@ def apply_transform(
     separator: str,
     spark: SparkSession,
     input_df: DataFrame,
-    edge_mapping_dict: dict,
+    hard_node_mapping_dict: dict,
 ) -> DataFrame:
     """Applies hard negative transformation to each row.
 
@@ -43,7 +43,7 @@ def apply_transform(
         The spark session.
     input_df : DataFrame
         The input DataFrame to apply transformation to.
-    edge_mapping_dict: dict
+    hard_node_mapping_dict: dict
         The mapping dictionary contain mapping file directory and edge type.
     """
     column_type = input_df.schema[cols[0]].dataType
@@ -54,9 +54,9 @@ def apply_transform(
     # Edge type should be (src_ntype:relation_type:dst_ntype)
     # Only support hard negative for destination nodes. Get the node type of destination nodes.
     # TODO: support hard negative for source nodes.
-    _, _, dst_type = edge_mapping_dict["edge_type"].split(":")
-    mapping_prefix = edge_mapping_dict["mapping_path"]
-    format_name = edge_mapping_dict["format_name"]
+    _, _, dst_type = hard_node_mapping_dict["edge_type"].split(":")
+    mapping_prefix = hard_node_mapping_dict["mapping_path"]
+    format_name = hard_node_mapping_dict["format_name"]
     hard_negative_node_mapping = spark.read.parquet(
         f"{mapping_prefix}{dst_type}/{format_name}/*.parquet"
     )
@@ -90,24 +90,28 @@ class DistHardEdgeNegativeTransformation(DistributedTransformation):
         The separator for string input value. Only required when input value type is string.
     spark: SparkSession
         The spark session.
-    edge_mapping_dict: dict
+    hard_node_mapping_dict: dict
         The node type and mapping directory.
     """
 
     def __init__(
-        self, cols: Sequence[str], spark: SparkSession, separator: str = "", edge_mapping_dict=None
+        self,
+        cols: Sequence[str],
+        spark: SparkSession,
+        separator: str = "",
+        hard_node_mapping_dict=None,
     ) -> None:
         super().__init__(cols, spark)
         self.cols = cols
         assert len(self.cols) == 1, "Hard Negative Transformation only supports single column"
         self.separator = separator
-        self.edge_mapping_dict = edge_mapping_dict
-        assert self.edge_mapping_dict, "edge mapping dict cannot be None for hard negative "
+        self.hard_node_mapping_dict = hard_node_mapping_dict
+        assert self.hard_node_mapping_dict, "edge mapping dict cannot be None for hard negative "
 
     def apply(self, input_df: DataFrame) -> DataFrame:
         assert self.spark
         transformed_df = apply_transform(
-            self.cols, self.separator, self.spark, input_df, self.edge_mapping_dict
+            self.cols, self.separator, self.spark, input_df, self.hard_node_mapping_dict
         )
 
         return transformed_df
diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py
index 476688776d..07ce95bf2a 100644
--- a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py
+++ b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py
@@ -1655,14 +1655,15 @@ def _process_edge_features(
                 .get(feat_conf.feat_name, {})
             )
             # Hard Negative Transformation use case, but should be able to be reused
-            edge_mapping_dict = {
-                "edge_type": edge_type,
-                "mapping_path": f"{self.output_prefix}/raw_id_mappings/",
-                "format_name": FORMAT_NAME,
-            }
-            transformer = DistFeatureTransformer(
-                feat_conf, self.spark, json_representation, edge_mapping_dict
-            )
+            if feat_conf.feat_type == "edge_dst_hard_negative":
+                hard_node_mapping_dict = {
+                    "edge_type": edge_type,
+                    "mapping_path": f"{self.output_prefix}/raw_id_mappings/",
+                    "format_name": FORMAT_NAME,
+                }
+                feat_conf.transformation_kwargs["hard_node_mapping_dict"] = hard_node_mapping_dict
+
+            transformer = DistFeatureTransformer(feat_conf, self.spark, json_representation)
 
             if json_representation:
                 logging.info(
diff --git a/graphstorm-processing/tests/test_converter.py b/graphstorm-processing/tests/test_converter.py
index 30b04f3855..26b837c9a1 100644
--- a/graphstorm-processing/tests/test_converter.py
+++ b/graphstorm-processing/tests/test_converter.py
@@ -406,7 +406,7 @@ def test_convert_gsprocessing(converter: GConstructConfigConverter):
                 {
                     "feature_col": ["author"],
                     "feature_name": "hard_negative",
-                    "transform": {"name": "edge_dst_hard_negative"},
+                    "transform": {"name": "edge_dst_hard_negative", "separator": ";"},
                 },
             ],
             "labels": [
@@ -516,7 +516,7 @@ def test_convert_gsprocessing(converter: GConstructConfigConverter):
         {
             "column": "author",
             "name": "hard_negative",
-            "transformation": {"name": "edge_dst_hard_negative"},
+            "transformation": {"name": "edge_dst_hard_negative", "separator": ";"},
         },
     ]
     assert edges_output["labels"] == [
diff --git a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py
index 0814384164..d2aa6ddece 100755
--- a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py
+++ b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py
@@ -46,22 +46,26 @@ def test_hard_negative_example_list(spark: SparkSession, check_df_schema, tmp_pa
     mapping_column = [NODE_MAPPING_STR, NODE_MAPPING_INT]
     mapping_df = spark.createDataFrame(mapping_data, schema=mapping_column)
     mapping_df.repartition(1).write.parquet(f"{tmp_path}/raw_id_mappings/dst_type/parquet")
-    edge_mapping_dict = {
+    hard_node_mapping_dict = {
         "edge_type": "src_type:relation:dst_type",
         "mapping_path": f"{tmp_path}/raw_id_mappings/",
         "format_name": "parquet",
     }
     hard_negative_transformation = DistHardEdgeNegativeTransformation(
-        ["hard_negative"], spark=spark, edge_mapping_dict=edge_mapping_dict, separator=None
+        ["hard_negative"],
+        spark=spark,
+        hard_node_mapping_dict=hard_node_mapping_dict,
+        separator=None,
     )
     output_df = hard_negative_transformation.apply(input_df)
     check_df_schema(output_df)
     output_data = output_df.collect()
 
+    # Length should be 4 for each tensor because there are 4 distinct nodes for dst node
     expected_output = [[1, -1, -1, -1], [2, 3, -1, -1], [3, 0, 1, -1], [0, -1, -1, -1]]
 
     for idx, row in enumerate(output_data):
-        np.testing.assert_almost_equal(
+        np.testing.assert_equal(
             row[0], expected_output[idx], decimal=3, err_msg=f"Row {idx} is not equal"
         )
 
@@ -93,15 +97,16 @@ def test_hard_negative_example_str(spark: SparkSession, check_df_schema, tmp_pat
         "format_name": "parquet",
     }
     hard_negative_transformation = DistHardEdgeNegativeTransformation(
-        ["hard_negative"], spark=spark, edge_mapping_dict=edge_mapping_dict, separator=";"
+        ["hard_negative"], spark=spark, hard_node_mapping_dict=hard_node_mapping_dict, separator=";"
     )
     output_df = hard_negative_transformation.apply(input_df)
     check_df_schema(output_df)
     output_data = output_df.collect()
 
+    # Length should be 4 for each tensor because there are 4 distinct nodes for dst node
     expected_output = [[1, -1, -1, -1], [2, 3, -1, -1], [3, 0, 1, -1], [0, -1, -1, -1]]
 
     for idx, row in enumerate(output_data):
-        np.testing.assert_almost_equal(
+        np.testing.assert_equal(
             row[0], expected_output[idx], decimal=3, err_msg=f"Row {idx} is not equal"
         )

From 2e47f2d2c8a574a3bf08c3cdcd143f4c23981354 Mon Sep 17 00:00:00 2001
From: jalencato <jalencato23pistons@gmail.com>
Date: Mon, 11 Nov 2024 11:49:33 -0800
Subject: [PATCH 34/50] Apply suggestions from code review

Co-authored-by: xiang song(charlie.song) <classicxsong@gmail.com>
---
 python/graphstorm/gpartition/post_hard_negative.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/graphstorm/gpartition/post_hard_negative.py b/python/graphstorm/gpartition/post_hard_negative.py
index be77cead5e..a739f20bf6 100644
--- a/python/graphstorm/gpartition/post_hard_negative.py
+++ b/python/graphstorm/gpartition/post_hard_negative.py
@@ -56,7 +56,7 @@ def load_hard_negative_config(gsprocessing_config):
 def shuffle_hard_negative_nids(gsprocessing_config, num_parts, output_path):
     """Shuffle hard negative edge feature ids with int-to-int node id mapping.
     The function here align with the shuffle_hard_nids in graphstorm.gconstruct.utils.
-    Create an additional function to handle the id mappings under distributed setting.
+    Create an additional function to handle the id mappings under the distributed setting.
 
     Parameters
     ----------------

From e631d2c90ae3b763dc55caae0d0ef3e8ea272227 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Mon, 11 Nov 2024 20:50:35 +0000
Subject: [PATCH 35/50] apply comments for gspartition

---
 graphstorm-processing/tests/test_converter.py |   2 +-
 .../test_dist_hard_negative_transformation.py |   6 +-
 .../gpartition/dist_partition_graph.py        |  27 +--
 .../gpartition/post_hard_negative.py          |  14 +-
 python/graphstorm/model/utils.py              |  14 +-
 .../gsprocessing_hard_negative_config.json    |  79 ++++++++
 ...gsprocessing_non_hard_negative_config.json |  67 +++++++
 .../test_hard_negative_post_partition.py      | 181 +-----------------
 8 files changed, 186 insertions(+), 204 deletions(-)
 create mode 100644 tests/unit-tests/gpartition/config/gsprocessing_hard_negative_config.json
 create mode 100644 tests/unit-tests/gpartition/config/gsprocessing_non_hard_negative_config.json

diff --git a/graphstorm-processing/tests/test_converter.py b/graphstorm-processing/tests/test_converter.py
index 26b837c9a1..a4871342d2 100644
--- a/graphstorm-processing/tests/test_converter.py
+++ b/graphstorm-processing/tests/test_converter.py
@@ -516,7 +516,7 @@ def test_convert_gsprocessing(converter: GConstructConfigConverter):
         {
             "column": "author",
             "name": "hard_negative",
-            "transformation": {"name": "edge_dst_hard_negative", "separator": ";"},
+            "transformation": {"name": "edge_dst_hard_negative", "kwargs": {"separator": ";"}},
         },
     ]
     assert edges_output["labels"] == [
diff --git a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py
index d2aa6ddece..179a65dc3c 100755
--- a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py
+++ b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py
@@ -66,7 +66,7 @@ def test_hard_negative_example_list(spark: SparkSession, check_df_schema, tmp_pa
 
     for idx, row in enumerate(output_data):
         np.testing.assert_equal(
-            row[0], expected_output[idx], decimal=3, err_msg=f"Row {idx} is not equal"
+            row[0], expected_output[idx], err_msg=f"Row {idx} is not equal"
         )
 
 
@@ -91,7 +91,7 @@ def test_hard_negative_example_str(spark: SparkSession, check_df_schema, tmp_pat
     mapping_column = [NODE_MAPPING_STR, NODE_MAPPING_INT]
     mapping_df = spark.createDataFrame(mapping_data, schema=mapping_column)
     mapping_df.repartition(1).write.parquet(f"{tmp_path}/raw_id_mappings/dst_type/parquet")
-    edge_mapping_dict = {
+    hard_node_mapping_dict = {
         "edge_type": "src_type:relation:dst_type",
         "mapping_path": f"{tmp_path}/raw_id_mappings/",
         "format_name": "parquet",
@@ -108,5 +108,5 @@ def test_hard_negative_example_str(spark: SparkSession, check_df_schema, tmp_pat
 
     for idx, row in enumerate(output_data):
         np.testing.assert_equal(
-            row[0], expected_output[idx], decimal=3, err_msg=f"Row {idx} is not equal"
+            row[0], expected_output[idx], err_msg=f"Row {idx} is not equal"
         )
diff --git a/python/graphstorm/gpartition/dist_partition_graph.py b/python/graphstorm/gpartition/dist_partition_graph.py
index 6a4e938b16..58ed0a57a5 100644
--- a/python/graphstorm/gpartition/dist_partition_graph.py
+++ b/python/graphstorm/gpartition/dist_partition_graph.py
@@ -191,22 +191,15 @@ def main():
         )
 
     # Hard Negative Mapping
-    if args.gsprocessing_config:
-        gsprocessing_config = args.gsprocessing_config
-        shuffle_hard_negative_nids(f"{args.input_path}/{gsprocessing_config}",
-                                   args.num_parts, args.output_path)
-    else:
-        for filename in os.listdir(args.input_path):
-            if filename.endswith("_with_transformations.json"):
-                gsprocessing_config = filename
-                shuffle_hard_negative_nids(f"{args.input_path}/{gsprocessing_config}",
-                                           args.num_parts, args.output_path)
-                break
-        else:
-            # Did not raise error here for not introducing the break change,
-            # but will raise warning here to warn customers.
-            logging.info("Skip the hard negative node ID mapping, "
-                         "please upgrade to the latest GSProcessing.")
+    # Load GSProcessing config from launch_arguments generated by GSProcessing
+    # The generated GSProcessing config has the _with_transformations.json suffix.
+    with open(os.path.join(args.input_path, "launch_arguments.json"),
+              "r", encoding="utf-8") as f:
+        gsprocessing_launch_arguments: Dict = json.load(f)
+    gsprocessing_config = gsprocessing_launch_arguments["config_filename"]
+    gsprocessing_config = gsprocessing_config.replace(".json", "_with_transformations.json")
+    shuffle_hard_negative_nids(f"{args.input_path}/{gsprocessing_config}",
+                               args.num_parts, args.output_path)
 
 
 def parse_args() -> argparse.Namespace:
@@ -215,8 +208,6 @@ def parse_args() -> argparse.Namespace:
                                         + "or regression tasks")
     argparser.add_argument("--input-path", type=str, required=True,
                            help="Path to input DGL chunked data.")
-    argparser.add_argument("--gsprocessing-config", type=str,
-                           help="Path to the input GSProcessing config data.")
     argparser.add_argument("--metadata-filename", type=str, default="metadata.json",
                            help="Name for the chunked DGL data metadata file.")
     argparser.add_argument("--output-path", type=str, required=True,
diff --git a/python/graphstorm/gpartition/post_hard_negative.py b/python/graphstorm/gpartition/post_hard_negative.py
index a739f20bf6..45525894a0 100644
--- a/python/graphstorm/gpartition/post_hard_negative.py
+++ b/python/graphstorm/gpartition/post_hard_negative.py
@@ -35,7 +35,7 @@ def load_hard_negative_config(gsprocessing_config):
 
     # Hard Negative only supports link prediction
     edges_config = config['graph']['edges']
-    mapping_edge_list = []
+    hard_neg_list = []
     for single_edge_config in edges_config:
         if "features" not in single_edge_config:
             continue
@@ -47,13 +47,13 @@ def load_hard_negative_config(gsprocessing_config):
                                      single_edge_config["relation"]["type"],
                                      single_edge_config["dest"]["type"]])
                 hard_neg_feat_name = single_feature['name']
-                mapping_edge_list.append({"dst_node_type": single_edge_config["dest"]["type"],
+                hard_neg_list.append({"dst_node_type": single_edge_config["dest"]["type"],
                                           "edge_type": edge_type,
                                           "hard_neg_feat_name": hard_neg_feat_name})
-    return mapping_edge_list
+    return hard_neg_list
 
 
-def shuffle_hard_negative_nids(gsprocessing_config, num_parts, output_path):
+def shuffle_hard_negative_nids(gsprocessing_config, num_parts, graph_path):
     """Shuffle hard negative edge feature ids with int-to-int node id mapping.
     The function here align with the shuffle_hard_nids in graphstorm.gconstruct.utils.
     Create an additional function to handle the id mappings under the distributed setting.
@@ -64,7 +64,7 @@ def shuffle_hard_negative_nids(gsprocessing_config, num_parts, output_path):
         Path to the gsprocessing config.
     num_parts: int
         Number of parts.
-    output_path: str
+    graph_path: str
         Path to the output DGL graph.
     """
     shuffled_edge_config = load_hard_negative_config(gsprocessing_config)
@@ -73,7 +73,7 @@ def shuffle_hard_negative_nids(gsprocessing_config, num_parts, output_path):
     for single_shuffled_edge_config in shuffled_edge_config:
         node_type = single_shuffled_edge_config["dst_node_type"]
         node_type_list.append(node_type)
-    node_mapping = load_dist_nid_map(f"{output_path}/dist_graph", node_type_list)
+    node_mapping = load_dist_nid_map(f"{graph_path}/dist_graph", node_type_list)
     gnid2pnid_mapping = {}
 
     def get_gnid2pnid_map(ntype):
@@ -89,7 +89,7 @@ def get_gnid2pnid_map(ntype):
 
     # iterate all the partitions to convert hard negative node ids.
     for i in range(num_parts):
-        part_path = os.path.join(f"{output_path}/dist_graph", f"part{i}")
+        part_path = os.path.join(f"{graph_path}/dist_graph", f"part{i}")
         edge_feat_path = os.path.join(part_path, "edge_feat.dgl")
 
         # load edge features first
diff --git a/python/graphstorm/model/utils.py b/python/graphstorm/model/utils.py
index 3d12ec3458..410bc6eac3 100644
--- a/python/graphstorm/model/utils.py
+++ b/python/graphstorm/model/utils.py
@@ -394,7 +394,19 @@ def _exchange_node_id_mapping(rank, world_size, device,
     return gather_list[0].to(th.device("cpu"))
 
 def load_dist_nid_map(node_id_mapping_file, ntypes):
-    """ Wrapper for load_dist_nid_map.
+    """ Load id mapping files in dist partition format.
+    
+        Parameters
+        ----------
+        node_id_mapping_file: str
+            Node mapping directory.
+        ntypes: list[str]
+            List of node types.
+
+        Return
+        ------
+        id_mappings: dict
+            Node mapping dictionary.
     """
     return _load_dist_nid_map(node_id_mapping_file, ntypes)
 
diff --git a/tests/unit-tests/gpartition/config/gsprocessing_hard_negative_config.json b/tests/unit-tests/gpartition/config/gsprocessing_hard_negative_config.json
new file mode 100644
index 0000000000..0d56a2cf3a
--- /dev/null
+++ b/tests/unit-tests/gpartition/config/gsprocessing_hard_negative_config.json
@@ -0,0 +1,79 @@
+{
+    "graph": {
+        "nodes": [
+            {
+                "data": {
+                    "format": "parquet",
+                    "files": [
+                        "./nodes/author.parquet"
+                    ]
+                },
+                "type": "author",
+                "column": "node_id"
+            },
+            {
+                "data": {
+                    "format": "parquet",
+                    "files": [
+                        "./nodes/paper.parquet"
+                    ]
+                },
+                "type": "paper",
+                "column": "node_id"
+            }
+        ],
+        "edges": [
+            {
+                "data": {
+                    "format": "parquet",
+                    "files": [
+                        "./edges/author_writing_paper_hard_negative.parquet"
+                    ]
+                },
+                "source": {
+                    "column": "source_id",
+                    "type": "author"
+                },
+                "dest": {
+                    "column": "dest_id",
+                    "type": "paper"
+                },
+                "relation": {
+                    "type": "writing"
+                },
+                "features": [
+                    {
+                        "column": "hard_neg",
+                        "name": "hard_neg_feat",
+                        "transformation": {
+                            "name": "edge_dst_hard_negative",
+                            "kwargs": {
+                                "separator": ";"
+                            }
+                        }
+                    }
+                ]
+            },
+            {
+                "data": {
+                    "format": "parquet",
+                    "files": [
+                        "./edges/paper_citing_paper.parquet"
+                    ]
+                },
+                "source": {
+                    "column": "source_id",
+                    "type": "paper"
+                },
+                "dest": {
+                    "column": "dest_id",
+                    "type": "paper"
+                },
+                "relation": {
+                    "type": "citing"
+                }
+            }
+        ]
+    },
+    "version": "gsprocessing-v1.0"
+}
\ No newline at end of file
diff --git a/tests/unit-tests/gpartition/config/gsprocessing_non_hard_negative_config.json b/tests/unit-tests/gpartition/config/gsprocessing_non_hard_negative_config.json
new file mode 100644
index 0000000000..daf5122113
--- /dev/null
+++ b/tests/unit-tests/gpartition/config/gsprocessing_non_hard_negative_config.json
@@ -0,0 +1,67 @@
+{
+    "graph": {
+        "nodes": [
+            {
+                "data": {
+                    "format": "parquet",
+                    "files": [
+                        "./nodes/author.parquet"
+                    ]
+                },
+                "type": "author",
+                "column": "node_id"
+            },
+            {
+                "data": {
+                    "format": "parquet",
+                    "files": [
+                        "./nodes/paper.parquet"
+                    ]
+                },
+                "type": "paper",
+                "column": "node_id"
+            }
+        ],
+        "edges": [
+            {
+                "data": {
+                    "format": "parquet",
+                    "files": [
+                        "./edges/author_writing_paper_hard_negative.parquet"
+                    ]
+                },
+                "source": {
+                    "column": "source_id",
+                    "type": "author"
+                },
+                "dest": {
+                    "column": "dest_id",
+                    "type": "paper"
+                },
+                "relation": {
+                    "type": "writing"
+                }
+            },
+            {
+                "data": {
+                    "format": "parquet",
+                    "files": [
+                        "./edges/paper_citing_paper.parquet"
+                    ]
+                },
+                "source": {
+                    "column": "source_id",
+                    "type": "paper"
+                },
+                "dest": {
+                    "column": "dest_id",
+                    "type": "paper"
+                },
+                "relation": {
+                    "type": "citing"
+                }
+            }
+        ]
+    },
+    "version": "gsprocessing-v1.0"
+}
\ No newline at end of file
diff --git a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py
index 1d65ce7d86..dc21115ead 100644
--- a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py
+++ b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py
@@ -27,194 +27,27 @@
 from graphstorm.gpartition.post_hard_negative import (shuffle_hard_negative_nids,
                                                       load_hard_negative_config)
 
-@pytest.fixture(scope="module", name="gsprocessing_hard_negative_config")
-def gsprocessing_config_hard_negative_dict_fixture() -> Dict:
-    return{
-        "graph": {
-            "nodes": [
-                {
-                    "data": {
-                        "format": "parquet",
-                        "files": [
-                            "./nodes/author.parquet"
-                        ]
-                    },
-                    "type": "author",
-                    "column": "node_id",
-                },
-                {
-                    "data": {
-                        "format": "parquet",
-                        "files": [
-                            "./nodes/paper.parquet"
-                        ]
-                    },
-                    "type": "paper",
-                    "column": "node_id"
-                }
-            ],
-            "edges": [
-                {
-                    "data": {
-                        "format": "parquet",
-                        "files": [
-                            "./edges/author_writing_paper_hard_negative.parquet"
-                        ]
-                    },
-                    "source": {
-                        "column": "source_id",
-                        "type": "author"
-                    },
-                    "dest": {
-                        "column": "dest_id",
-                        "type": "paper"
-                    },
-                    "relation": {
-                        "type": "writing"
-                    },
-                    "features": [
-                        {
-                            "column": "hard_neg",
-                            "name": "hard_neg_feat",
-                            "transformation": {
-                                "name": "edge_dst_hard_negative",
-                                "kwargs": {
-                                    "separator": ";"
-                                }
-                            }
-                        }
-                    ]
-                },
-                {
-                    "data": {
-                        "format": "parquet",
-                        "files": [
-                            "./edges/paper_citing_paper.parquet"
-                        ]
-                    },
-                    "source": {
-                        "column": "source_id",
-                        "type": "paper"
-                    },
-                    "dest": {
-                        "column": "dest_id",
-                        "type": "paper"
-                    },
-                    "relation": {
-                        "type": "citing"
-                    }
-                }
-            ]
-        },
-        "version": "gsprocessing-v1.0"
-    }
-
 
-@pytest.fixture(scope="module", name="gsprocessing_non_hard_negative_config")
-def gsprocessing_config_non_hard_negative_dict_fixture() -> Dict:
-    return{
-        "graph": {
-            "nodes": [
-                {
-                    "data": {
-                        "format": "parquet",
-                        "files": [
-                            "./nodes/author.parquet"
-                        ]
-                    },
-                    "type": "author",
-                    "column": "node_id",
-                },
-                {
-                    "data": {
-                        "format": "parquet",
-                        "files": [
-                            "./nodes/paper.parquet"
-                        ]
-                    },
-                    "type": "paper",
-                    "column": "node_id"
-                }
-            ],
-            "edges": [
-                {
-                    "data": {
-                        "format": "parquet",
-                        "files": [
-                            "./edges/author_writing_paper_hard_negative.parquet"
-                        ]
-                    },
-                    "source": {
-                        "column": "source_id",
-                        "type": "author"
-                    },
-                    "dest": {
-                        "column": "dest_id",
-                        "type": "paper"
-                    },
-                    "relation": {
-                        "type": "writing"
-                    }
-                },
-                {
-                    "data": {
-                        "format": "parquet",
-                        "files": [
-                            "./edges/paper_citing_paper.parquet"
-                        ]
-                    },
-                    "source": {
-                        "column": "source_id",
-                        "type": "paper"
-                    },
-                    "dest": {
-                        "column": "dest_id",
-                        "type": "paper"
-                    },
-                    "relation": {
-                        "type": "citing"
-                    }
-                }
-            ]
-        },
-        "version": "gsprocessing-v1.0"
-    }
-
-
-def test_load_hard_negative_config(tmp_path, gsprocessing_hard_negative_config: Dict,
-                                   gsprocessing_non_hard_negative_config: Dict):
-    # For config with gsprocessing_config.json
-    json_file_path = f"{tmp_path}/gsprocessing_config.json"
-
-    # Write the dictionary to the JSON file
-    with open(json_file_path, 'w') as json_file:
-        json.dump(gsprocessing_hard_negative_config, json_file, indent=4)
+def test_load_hard_negative_config():
+    # For config with hard negative transformation
+    json_file_path = f"./config/gsprocessing_hard_negative_config.json"
 
     res = load_hard_negative_config(json_file_path)
 
     assert res[0] == {'dst_node_type': 'paper', 'edge_type':
         'author:writing:paper', 'hard_neg_feat_name': 'hard_neg_feat'}
 
-    # For config without hard negative feature definition
-    json_file_path = f"{tmp_path}/gsprocessing_config.json"
-
-    # Write the dictionary to the JSON file
-    with open(json_file_path, 'w') as json_file:
-        json.dump(gsprocessing_non_hard_negative_config,
-                  json_file, indent=4)
+    # For config without hard negative transformation
+    json_file_path = f"./config/gsprocessing_non_hard_negative_config.json"
 
     res = load_hard_negative_config(json_file_path)
 
     assert res == []
 
 
-def test_shuffle_hard_negative_nids(tmp_path, gsprocessing_hard_negative_config: Dict):
+def test_shuffle_hard_negative_nids(tmp_path):
     # For config with gsprocessing_config.json
-    json_file_path = f"{tmp_path}/gsprocessing_config.json"
-
-    # Write the dictionary to the JSON file
-    with open(json_file_path, 'w') as json_file:
-        json.dump(gsprocessing_hard_negative_config, json_file, indent=4)
+    json_file_path = f"./config/gsprocessing_hard_negative_config.json"
 
     # Generate dgl graph
     partitioned_graph = f"{tmp_path}/partitioned_graph"

From ca462117ecb1cf26125f3a66c656df93fe38f112 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Mon, 11 Nov 2024 20:59:51 +0000
Subject: [PATCH 36/50] apply comment

---
 .../data_transformations/dist_feature_transformer.py      | 3 ---
 .../tests/test_dist_hard_negative_transformation.py       | 8 ++------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
index ec11de7ff5..a42613b26a 100644
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
@@ -43,7 +43,6 @@ def __init__(
         feature_config: FeatureConfig,
         spark: SparkSession,
         json_representation: dict,
-        edge_mapping_dict: dict = None,
     ):
         feat_type = feature_config.feat_type
         feat_name = feature_config.feat_name
@@ -51,8 +50,6 @@ def __init__(
         self.transformation: DistributedTransformation
         # We use this to re-apply transformations
         self.json_representation = json_representation
-        # Node Mapping Info for hard negative feature transformation
-        self.edge_mapping_dict = edge_mapping_dict
 
         default_kwargs = {
             "cols": feature_config.cols,
diff --git a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py
index 179a65dc3c..f153301eb7 100755
--- a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py
+++ b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py
@@ -65,9 +65,7 @@ def test_hard_negative_example_list(spark: SparkSession, check_df_schema, tmp_pa
     expected_output = [[1, -1, -1, -1], [2, 3, -1, -1], [3, 0, 1, -1], [0, -1, -1, -1]]
 
     for idx, row in enumerate(output_data):
-        np.testing.assert_equal(
-            row[0], expected_output[idx], err_msg=f"Row {idx} is not equal"
-        )
+        np.testing.assert_equal(row[0], expected_output[idx], err_msg=f"Row {idx} is not equal")
 
 
 def test_hard_negative_example_str(spark: SparkSession, check_df_schema, tmp_path):
@@ -107,6 +105,4 @@ def test_hard_negative_example_str(spark: SparkSession, check_df_schema, tmp_pat
     expected_output = [[1, -1, -1, -1], [2, 3, -1, -1], [3, 0, 1, -1], [0, -1, -1, -1]]
 
     for idx, row in enumerate(output_data):
-        np.testing.assert_equal(
-            row[0], expected_output[idx], err_msg=f"Row {idx} is not equal"
-        )
+        np.testing.assert_equal(row[0], expected_output[idx], err_msg=f"Row {idx} is not equal")

From eff82aa5e5077ce51b741d169429ab986c43717a Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Mon, 11 Nov 2024 21:01:58 +0000
Subject: [PATCH 37/50] roll back

---
 .../data_transformations/dist_feature_transformer.py         | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
index a42613b26a..306b21aaaa 100644
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py
@@ -39,10 +39,7 @@ class DistFeatureTransformer(object):
     """
 
     def __init__(
-        self,
-        feature_config: FeatureConfig,
-        spark: SparkSession,
-        json_representation: dict,
+        self, feature_config: FeatureConfig, spark: SparkSession, json_representation: dict
     ):
         feat_type = feature_config.feat_type
         feat_name = feature_config.feat_name

From 9dd9ff5f58547209e4a96b337aff07111098c56b Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Mon, 11 Nov 2024 21:04:08 +0000
Subject: [PATCH 38/50] apply comment

---
 .../dist_transformations/dist_hard_negative_transformation.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index 6ba99f9cd6..131269d0c8 100755
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -91,7 +91,7 @@ class DistHardEdgeNegativeTransformation(DistributedTransformation):
     spark: SparkSession
         The spark session.
     hard_node_mapping_dict: dict
-        The node type and mapping directory.
+        The mapping dictionary contain mapping file directory and edge type.
     """
 
     def __init__(

From 1ab7f6d2f1c510a8528a325526f899b5baed847f Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Mon, 11 Nov 2024 21:39:12 +0000
Subject: [PATCH 39/50] fix test

---
 .../gpartition/test_hard_negative_post_partition.py      | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py
index dc21115ead..e8616c3280 100644
--- a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py
+++ b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py
@@ -30,7 +30,8 @@
 
 def test_load_hard_negative_config():
     # For config with hard negative transformation
-    json_file_path = f"./config/gsprocessing_hard_negative_config.json"
+    json_file_path = (f"/graphstorm/tests/unit-tests/gpartition/"
+                      f"config/gsprocessing_hard_negative_config.json")
 
     res = load_hard_negative_config(json_file_path)
 
@@ -38,7 +39,8 @@ def test_load_hard_negative_config():
         'author:writing:paper', 'hard_neg_feat_name': 'hard_neg_feat'}
 
     # For config without hard negative transformation
-    json_file_path = f"./config/gsprocessing_non_hard_negative_config.json"
+    json_file_path = (f"/graphstorm/tests/unit-tests/gpartition/"
+                      f"config/gsprocessing_non_hard_negative_config.json")
 
     res = load_hard_negative_config(json_file_path)
 
@@ -47,7 +49,8 @@ def test_load_hard_negative_config():
 
 def test_shuffle_hard_negative_nids(tmp_path):
     # For config with gsprocessing_config.json
-    json_file_path = f"./config/gsprocessing_hard_negative_config.json"
+    json_file_path = (f"/graphstorm/tests/unit-tests/gpartition/"
+                      f"config/gsprocessing_hard_negative_config.json")
 
     # Generate dgl graph
     partitioned_graph = f"{tmp_path}/partitioned_graph"

From f93dff2ed2789c8d243d3422a8cc3bc68aedcdcc Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Mon, 11 Nov 2024 22:48:16 +0000
Subject: [PATCH 40/50] check existence for launch_arguments.json

---
 .../gpartition/dist_partition_graph.py         | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/python/graphstorm/gpartition/dist_partition_graph.py b/python/graphstorm/gpartition/dist_partition_graph.py
index 58ed0a57a5..62b319254a 100644
--- a/python/graphstorm/gpartition/dist_partition_graph.py
+++ b/python/graphstorm/gpartition/dist_partition_graph.py
@@ -193,13 +193,17 @@ def main():
     # Hard Negative Mapping
     # Load GSProcessing config from launch_arguments generated by GSProcessing
     # Generated GSProcessing config will have _with_transformation suffix.
-    with open(os.path.join(args.input_path, "launch_arguments.json"),
-              "r", encoding="utf-8") as f:
-        gsprocessing_launch_arguments: Dict = json.load(f)
-    gsprocessing_config = gsprocessing_launch_arguments["config_filename"]
-    gsprocessing_config = gsprocessing_config.replace(".json", "_with_transformations.json")
-    shuffle_hard_negative_nids(f"{args.input_path}/{gsprocessing_config}",
-                               args.num_parts, args.output_path)
+    if os.path.exists(args.input_path, "launch_arguments.json"):
+        with open(os.path.join(args.input_path, "launch_arguments.json"),
+                  "r", encoding="utf-8") as f:
+            gsprocessing_launch_arguments: Dict = json.load(f)
+        gsprocessing_config = gsprocessing_launch_arguments["config_filename"]
+        gsprocessing_config = gsprocessing_config.replace(".json", "_with_transformations.json")
+        shuffle_hard_negative_nids(f"{args.input_path}/{gsprocessing_config}",
+                                   args.num_parts, args.output_path)
+    else:
+        logging.info("Skip the hard negative node ID mapping, "
+                     "the processed data is not generated by GSProcessing.")
 
 
 def parse_args() -> argparse.Namespace:

From b3760d8aaf6de546131ec29e2672b56ef2fc4245 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Mon, 11 Nov 2024 22:58:31 +0000
Subject: [PATCH 41/50] check existence for launch_arguments.json

---
 python/graphstorm/gpartition/dist_partition_graph.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/graphstorm/gpartition/dist_partition_graph.py b/python/graphstorm/gpartition/dist_partition_graph.py
index 62b319254a..fffddbe738 100644
--- a/python/graphstorm/gpartition/dist_partition_graph.py
+++ b/python/graphstorm/gpartition/dist_partition_graph.py
@@ -193,9 +193,9 @@ def main():
     # Hard Negative Mapping
     # Load GSProcessing config from launch_arguments generated by GSProcessing
     # Generated GSProcessing config will have _with_transformation suffix.
-    if os.path.exists(args.input_path, "launch_arguments.json"):
-        with open(os.path.join(args.input_path, "launch_arguments.json"),
-                  "r", encoding="utf-8") as f:
+    launch_arguments_path = os.path.join(args.input_path, "launch_arguments.json")
+    if os.path.exists(launch_arguments_path):
+        with open(launch_arguments_path, "r", encoding="utf-8") as f:
             gsprocessing_launch_arguments: Dict = json.load(f)
         gsprocessing_config = gsprocessing_launch_arguments["config_filename"]
         gsprocessing_config = gsprocessing_config.replace(".json", "_with_transformations.json")

From 32f3412c72833b8fec3ef2671667cf298af56682 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Tue, 12 Nov 2024 02:03:00 +0000
Subject: [PATCH 42/50] change gsprocessing part

---
 .../graphstorm_processing/constants.py        |  4 ++
 .../dist_hard_negative_transformation.py      | 41 ++++++++++++-------
 2 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/graphstorm-processing/graphstorm_processing/constants.py b/graphstorm-processing/graphstorm_processing/constants.py
index 06aa3b3d36..a732306ab8 100644
--- a/graphstorm-processing/graphstorm_processing/constants.py
+++ b/graphstorm-processing/graphstorm_processing/constants.py
@@ -58,6 +58,10 @@
 HUGGINGFACE_TOKENIZE = "tokenize_hf"
 HUGGINGFACE_EMB = "embedding_hf"
 
+################# Hard Negative transformations  ################
+ORDER_INDEX = "hard_negative_order_id"
+EXPLODE_HARD_NEGATIVE_VALUE = "hard_negative_exploded_single_value"
+
 ################# Node Mapping  ################
 NODE_MAPPING_STR = "orig"
 NODE_MAPPING_INT = "new"
diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index 131269d0c8..ffd73c5ec0 100755
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -19,7 +19,12 @@
 from pyspark.sql.types import ArrayType, IntegerType, StringType
 from pyspark.sql import DataFrame, functions as F, SparkSession
 
-from graphstorm_processing.constants import NODE_MAPPING_STR, NODE_MAPPING_INT
+from graphstorm_processing.constants import (
+    NODE_MAPPING_STR,
+    NODE_MAPPING_INT,
+    ORDER_INDEX,
+    EXPLODE_HARD_NEGATIVE_VALUE,
+)
 
 from .base_dist_transformation import DistributedTransformation
 
@@ -62,21 +67,29 @@ def apply_transform(
     )
     node_mapping_length = hard_negative_node_mapping.count()
 
-    # TODO: This method may suffer from scalability issue,
-    # we can make this method to join-based solution.
-    hard_negative_node_mapping_dict = {
-        row[NODE_MAPPING_STR]: row[NODE_MAPPING_INT] for row in hard_negative_node_mapping.collect()
-    }
+    # TODO: Use panda series to possibly improve the efficiency
+    transformed_df = transformed_df.withColumn(ORDER_INDEX, F.monotonically_increasing_id())
+    transformed_df = transformed_df.withColumn(
+        EXPLODE_HARD_NEGATIVE_VALUE, F.explode(F.col(cols[0]))
+    )
+    transformed_df = transformed_df.join(
+        hard_negative_node_mapping,
+        transformed_df[EXPLODE_HARD_NEGATIVE_VALUE] == hard_negative_node_mapping[NODE_MAPPING_STR],
+        "inner",
+    ).select(NODE_MAPPING_INT, ORDER_INDEX)
+    transformed_df = transformed_df.groupBy(ORDER_INDEX).agg(
+        F.collect_list(NODE_MAPPING_INT).alias(cols[0])
+    )
 
     # Same length for feature to convert to tensor
-    def map_values(hard_neg_list):
-        mapped_values = [hard_negative_node_mapping_dict.get(item, -1) for item in hard_neg_list]
-        while len(mapped_values) < node_mapping_length:
-            mapped_values.append(-1)
-        return mapped_values
-
-    map_values_udf = F.udf(map_values, ArrayType(IntegerType()))
-    transformed_df = transformed_df.select(map_values_udf(F.col(cols[0])).alias(cols[0]))
+    def pad_mapped_values(hard_neg_list):
+        while len(hard_neg_list) < node_mapping_length:
+            hard_neg_list.append(-1)
+        return hard_neg_list
+
+    pad_value_udf = F.udf(pad_mapped_values, ArrayType(IntegerType()))
+    transformed_df = transformed_df.orderBy(ORDER_INDEX)
+    transformed_df = transformed_df.select(pad_value_udf(F.col(cols[0])).alias(cols[0]))
 
     return transformed_df
 

From c61bdc958d8f261fe284bea8d83f265f509ba424 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Tue, 12 Nov 2024 02:54:51 +0000
Subject: [PATCH 43/50] comment

---
 .../dist_transformations/dist_hard_negative_transformation.py   | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index ffd73c5ec0..c0dc715ad9 100755
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -68,6 +68,7 @@ def apply_transform(
     node_mapping_length = hard_negative_node_mapping.count()
 
     # TODO: Use panda series to possibly improve the efficiency
+    # Explode the original list and join node id mapping dataframe
     transformed_df = transformed_df.withColumn(ORDER_INDEX, F.monotonically_increasing_id())
     transformed_df = transformed_df.withColumn(
         EXPLODE_HARD_NEGATIVE_VALUE, F.explode(F.col(cols[0]))
@@ -88,6 +89,7 @@ def pad_mapped_values(hard_neg_list):
         return hard_neg_list
 
     pad_value_udf = F.udf(pad_mapped_values, ArrayType(IntegerType()))
+    # Make sure it keeps the original order
     transformed_df = transformed_df.orderBy(ORDER_INDEX)
     transformed_df = transformed_df.select(pad_value_udf(F.col(cols[0])).alias(cols[0]))
 

From 90fcd751ce896c0c238b70fb9e52b368635affa7 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Tue, 12 Nov 2024 19:47:26 +0000
Subject: [PATCH 44/50] hard negative processing

---
 .../dist_hard_negative_transformation.py      | 124 ++++++++----------
 python/graphstorm/gconstruct/utils.py         |  36 +++--
 .../gpartition/post_hard_negative.py          |  15 +--
 python/graphstorm/model/utils.py              |  13 +-
 .../test_hard_negative_post_partition.py      |  57 ++++----
 5 files changed, 117 insertions(+), 128 deletions(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index c0dc715ad9..5ccee36a81 100755
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -29,92 +29,35 @@
 from .base_dist_transformation import DistributedTransformation
 
 
-def apply_transform(
-    cols: Sequence[str],
-    separator: str,
-    spark: SparkSession,
-    input_df: DataFrame,
-    hard_node_mapping_dict: dict,
-) -> DataFrame:
-    """Applies hard negative transformation to each row.
+class DistHardEdgeNegativeTransformation(DistributedTransformation):
+    """Transformation to apply hard negative transformation.
 
     Parameters
     ----------
     cols : Sequence[str]
-        List of column names to apply normalization to.
-    separator: str, optional
-        The separator for string input value. Only required when input value type is string.
+        List of column names to apply hard negative transformation to.
     spark: SparkSession
         The spark session.
-    input_df : DataFrame
-        The input DataFrame to apply transformation to.
     hard_node_mapping_dict: dict
         The mapping dictionary contain mapping file directory and edge type.
-    """
-    column_type = input_df.schema[cols[0]].dataType
-    if isinstance(column_type, StringType):
-        transformed_df = input_df.withColumn(cols[0], split(col(cols[0]), separator))
-    else:
-        transformed_df = input_df
-    # Edge type should be (src_ntype:relation_type:dst_ntype)
-    # Only support hard negative for destination nodes. Get the node type of destination nodes.
-    # TODO: support hard negative for source nodes.
-    _, _, dst_type = hard_node_mapping_dict["edge_type"].split(":")
-    mapping_prefix = hard_node_mapping_dict["mapping_path"]
-    format_name = hard_node_mapping_dict["format_name"]
-    hard_negative_node_mapping = spark.read.parquet(
-        f"{mapping_prefix}{dst_type}/{format_name}/*.parquet"
-    )
-    node_mapping_length = hard_negative_node_mapping.count()
-
-    # TODO: Use panda series to possibly improve the efficiency
-    # Explode the original list and join node id mapping dataframe
-    transformed_df = transformed_df.withColumn(ORDER_INDEX, F.monotonically_increasing_id())
-    transformed_df = transformed_df.withColumn(
-        EXPLODE_HARD_NEGATIVE_VALUE, F.explode(F.col(cols[0]))
-    )
-    transformed_df = transformed_df.join(
-        hard_negative_node_mapping,
-        transformed_df[EXPLODE_HARD_NEGATIVE_VALUE] == hard_negative_node_mapping[NODE_MAPPING_STR],
-        "inner",
-    ).select(NODE_MAPPING_INT, ORDER_INDEX)
-    transformed_df = transformed_df.groupBy(ORDER_INDEX).agg(
-        F.collect_list(NODE_MAPPING_INT).alias(cols[0])
-    )
-
-    # Same length for feature to convert to tensor
-    def pad_mapped_values(hard_neg_list):
-        while len(hard_neg_list) < node_mapping_length:
-            hard_neg_list.append(-1)
-        return hard_neg_list
-
-    pad_value_udf = F.udf(pad_mapped_values, ArrayType(IntegerType()))
-    # Make sure it keeps the original order
-    transformed_df = transformed_df.orderBy(ORDER_INDEX)
-    transformed_df = transformed_df.select(pad_value_udf(F.col(cols[0])).alias(cols[0]))
-
-    return transformed_df
-
-
-class DistHardEdgeNegativeTransformation(DistributedTransformation):
-    """Transformation to apply hard negative transformation.
-
-    Parameters
-    ----------
+        {
+            "edge_type": str
+                Edge type to apply hard negative transformation.
+            "mapping_path": str
+                Path to the raw node mapping.
+            "format_name": str
+                Parquet.
+        }
     separator: str, optional
         The separator for string input value. Only required when input value type is string.
-    spark: SparkSession
-        The spark session.
-    hard_node_mapping_dict: dict
-        The mapping dictionary contain mapping file directory and edge type.
     """
 
     def __init__(
         self,
         cols: Sequence[str],
         spark: SparkSession,
+        hard_node_mapping_dict: dict,
         separator: str = "",
-        hard_node_mapping_dict=None,
     ) -> None:
         super().__init__(cols, spark)
         self.cols = cols
@@ -125,9 +68,48 @@ def __init__(
 
     def apply(self, input_df: DataFrame) -> DataFrame:
         assert self.spark
-        transformed_df = apply_transform(
-            self.cols, self.separator, self.spark, input_df, self.hard_node_mapping_dict
+        input_col = self.cols[0]
+        column_type = input_df.schema[input_col].dataType
+        if isinstance(column_type, StringType):
+            transformed_df = input_df.withColumn(input_col, split(col(input_col), self.separator))
+        else:
+            transformed_df = input_df
+        # Edge type should be (src_ntype:relation_type:dst_ntype)
+        # Only support hard negative for destination nodes. Get the node type of destination nodes.
+        # TODO: support hard negative for source nodes.
+        _, _, dst_type = self.hard_node_mapping_dict["edge_type"].split(":")
+        mapping_prefix = self.hard_node_mapping_dict["mapping_path"]
+        format_name = self.hard_node_mapping_dict["format_name"]
+        hard_negative_node_mapping = self.spark.read.parquet(
+            f"{mapping_prefix}{dst_type}/{format_name}/"
         )
+        node_mapping_length = hard_negative_node_mapping.count()
+
+        # TODO: Use panda series to possibly improve the efficiency
+        # Explode the original list and join node id mapping dataframe
+        transformed_df = transformed_df.withColumn(ORDER_INDEX, F.monotonically_increasing_id())
+        transformed_df = transformed_df.withColumn(
+            EXPLODE_HARD_NEGATIVE_VALUE, F.explode(F.col(input_col))
+        )
+        transformed_df = transformed_df.join(
+            hard_negative_node_mapping,
+            transformed_df[EXPLODE_HARD_NEGATIVE_VALUE] == hard_negative_node_mapping[NODE_MAPPING_STR],
+            "inner",
+        ).select(NODE_MAPPING_INT, ORDER_INDEX)
+        transformed_df = transformed_df.groupBy(ORDER_INDEX).agg(
+            F.collect_list(NODE_MAPPING_INT).alias(input_col)
+        )
+
+        # Same length for feature to convert to tensor
+        def pad_mapped_values(hard_neg_list):
+            if len(hard_neg_list) < node_mapping_length:
+                hard_neg_list.extend([-1] * (node_mapping_length - len(hard_neg_list)))
+            return hard_neg_list
+
+        pad_value_udf = F.udf(pad_mapped_values, ArrayType(IntegerType()))
+        # Make sure it keeps the original order
+        transformed_df = transformed_df.orderBy(ORDER_INDEX)
+        transformed_df = transformed_df.select(pad_value_udf(F.col(input_col)).alias(input_col))
 
         return transformed_df
 
diff --git a/python/graphstorm/gconstruct/utils.py b/python/graphstorm/gconstruct/utils.py
index e921293c2d..a9b60fd872 100644
--- a/python/graphstorm/gconstruct/utils.py
+++ b/python/graphstorm/gconstruct/utils.py
@@ -1118,6 +1118,28 @@ def get_hard_edge_negs_feats(hard_edge_neg_ops):
 
     return hard_edge_neg_feats
 
+def get_gnid2pnid_map(ntype, node_mapping, gnid2pnid_mapping):
+    """ Get global nid to partitioned nid mapping
+
+        Parameters
+        ----------
+        ntype: str
+            Path to the directory storing the partitioned graph.
+        node_mapping: dict of list
+            Dict of mapping. {ntype: mapping}
+        gnid2pnid_mapping: dict
+            Dict of mapping from global nid to partitioned id mapping.
+    """
+    if ntype in gnid2pnid_mapping:
+        return gnid2pnid_mapping[ntype]
+    else:
+        pnid2gnid_map = node_mapping[ntype]
+        gnid2pnid_map = th.argsort(pnid2gnid_map)
+        gnid2pnid_mapping[ntype] = gnid2pnid_map
+        # del ntype in node_mapping to save memory
+        del node_mapping[ntype]
+        return gnid2pnid_mapping[ntype]
+
 def shuffle_hard_nids(data_path, num_parts, hard_edge_neg_feats):
     """ Shuffle node ids of hard negatives from Graph node id space to
         Partition Node id space.
@@ -1136,17 +1158,6 @@ def shuffle_hard_nids(data_path, num_parts, hard_edge_neg_feats):
     node_mapping = load_maps(data_path, "node_mapping")
     gnid2pnid_mapping = {}
 
-    def get_gnid2pnid_map(ntype):
-        if ntype in gnid2pnid_mapping:
-            return gnid2pnid_mapping[ntype]
-        else:
-            pnid2gnid_map = node_mapping[ntype]
-            gnid2pnid_map = th.argsort(pnid2gnid_map)
-            gnid2pnid_mapping[ntype] = gnid2pnid_map
-            # del ntype in node_mapping to save memory
-            del node_mapping[ntype]
-            return gnid2pnid_mapping[ntype]
-
     # iterate all the partitions to convert hard negative node ids.
     for i in range(num_parts):
         part_path = os.path.join(data_path, f"part{i}")
@@ -1162,7 +1173,8 @@ def get_gnid2pnid_map(ntype):
                     efeat_name = f"{etype}/{neg_feat}"
                     hard_nids = edge_feats[efeat_name]
                     hard_nid_idx = hard_nids > -1
-                    gnid2pnid_map = get_gnid2pnid_map(neg_ntype)
+                    gnid2pnid_map = get_gnid2pnid_map(neg_ntype, node_mapping,
+                                                      gnid2pnid_mapping)
                     hard_nids[hard_nid_idx] = gnid2pnid_map[hard_nids[hard_nid_idx]]
 
         # replace the edge_feat.dgl with the updated one.
diff --git a/python/graphstorm/gpartition/post_hard_negative.py b/python/graphstorm/gpartition/post_hard_negative.py
index 45525894a0..cdb0930fdf 100644
--- a/python/graphstorm/gpartition/post_hard_negative.py
+++ b/python/graphstorm/gpartition/post_hard_negative.py
@@ -20,6 +20,7 @@
 import torch as th
 from dgl.data.utils import load_tensors, save_tensors
 from graphstorm.model.utils import load_dist_nid_map
+from graphstorm.gconstruct.utils import get_gnid2pnid_map
 
 
 def load_hard_negative_config(gsprocessing_config):
@@ -76,17 +77,6 @@ def shuffle_hard_negative_nids(gsprocessing_config, num_parts, graph_path):
     node_mapping = load_dist_nid_map(f"{graph_path}/dist_graph", node_type_list)
     gnid2pnid_mapping = {}
 
-    def get_gnid2pnid_map(ntype):
-        if ntype in gnid2pnid_mapping:
-            return gnid2pnid_mapping[ntype]
-        else:
-            pnid2gnid_map = node_mapping[ntype]
-            gnid2pnid_map = th.argsort(pnid2gnid_map)
-            gnid2pnid_mapping[ntype] = gnid2pnid_map
-            # del ntype in node_mapping to save memory
-            del node_mapping[ntype]
-            return gnid2pnid_mapping[ntype]
-
     # iterate all the partitions to convert hard negative node ids.
     for i in range(num_parts):
         part_path = os.path.join(f"{graph_path}/dist_graph", f"part{i}")
@@ -101,7 +91,8 @@ def get_gnid2pnid_map(ntype):
             efeat_name = f"{etype}/{neg_feat}"
             hard_nids = edge_feats[efeat_name].long()
             hard_nid_idx = hard_nids > -1
-            gnid2pnid_map = get_gnid2pnid_map(neg_ntype)
+            gnid2pnid_map = get_gnid2pnid_map(neg_ntype, node_mapping,
+                                              gnid2pnid_mapping)
             hard_nids[hard_nid_idx] = gnid2pnid_map[hard_nids[hard_nid_idx]]
 
         # replace the edge_feat.dgl with the updated one.
diff --git a/python/graphstorm/model/utils.py b/python/graphstorm/model/utils.py
index 410bc6eac3..2499de2814 100644
--- a/python/graphstorm/model/utils.py
+++ b/python/graphstorm/model/utils.py
@@ -395,7 +395,7 @@ def _exchange_node_id_mapping(rank, world_size, device,
 
 def load_dist_nid_map(node_id_mapping_file, ntypes):
     """ Load id mapping files in dist partition format.
-    
+
         Parameters
         ----------
         node_id_mapping_file: str
@@ -408,11 +408,6 @@ def load_dist_nid_map(node_id_mapping_file, ntypes):
         id_mappings: dict
             Node mapping dictionary.
     """
-    return _load_dist_nid_map(node_id_mapping_file, ntypes)
-
-def _load_dist_nid_map(node_id_mapping_file, ntypes):
-    """ Load id mapping files in dist partition format.
-    """
     # node_id_mapping_file it is actually a directory
     # <node_id_mapping_file>/part0, <node_id_mapping_file>/part1, ...
     part_dirs = [part_path for part_path in os.listdir(node_id_mapping_file) \
@@ -464,7 +459,7 @@ def distribute_nid_map(embeddings, rank, world_size,
             else:
                 # Homogeneous graph
                 # node id mapping file from dgl tools/distpartitioning/convert_partition.py.
-                ori_node_id_mapping = _load_dist_nid_map(node_id_mapping_file, ["_N"])["_N"]
+                ori_node_id_mapping = load_dist_nid_map(node_id_mapping_file, ["_N"])["_N"]
             _, node_id_mapping = th.sort(ori_node_id_mapping)
         else:
             node_id_mapping = None
@@ -479,7 +474,7 @@ def distribute_nid_map(embeddings, rank, world_size,
                 node_id_mappings = th.load(node_id_mapping_file)
             else:
                 # node id mapping file from dgl tools/distpartitioning/convert_partition.py.
-                node_id_mappings = _load_dist_nid_map(node_id_mapping_file,
+                node_id_mappings = load_dist_nid_map(node_id_mapping_file,
                                                       list(embeddings.keys()))
         else:
             node_id_mappings = None
@@ -1189,7 +1184,7 @@ def __init__(self, g, node_id_mapping_file, ntypes=None):
             id_mappings = th.load(node_id_mapping_file) if get_rank() == 0 else None
         else:
             # node id mapping file from dgl tools/distpartitioning/convert_partition.py.
-            id_mappings = _load_dist_nid_map(node_id_mapping_file, ntypes) \
+            id_mappings = load_dist_nid_map(node_id_mapping_file, ntypes) \
                 if get_rank() == 0 else None
 
         self._id_mapping_info = {
diff --git a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py
index e8616c3280..368562eb3c 100644
--- a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py
+++ b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py
@@ -27,32 +27,11 @@
 from graphstorm.gpartition.post_hard_negative import (shuffle_hard_negative_nids,
                                                       load_hard_negative_config)
 
+_ROOT = os.path.abspath(os.path.dirname(__file__))
 
-def test_load_hard_negative_config():
-    # For config with hard negative transformation
-    json_file_path = (f"/graphstorm/tests/unit-tests/gpartition/"
-                      f"config/gsprocessing_hard_negative_config.json")
-
-    res = load_hard_negative_config(json_file_path)
-
-    assert res[0] == {'dst_node_type': 'paper', 'edge_type':
-        'author:writing:paper', 'hard_neg_feat_name': 'hard_neg_feat'}
-
-    # For config without hard negative transformation
-    json_file_path = (f"/graphstorm/tests/unit-tests/gpartition/"
-                      f"config/gsprocessing_non_hard_negative_config.json")
 
-    res = load_hard_negative_config(json_file_path)
-
-    assert res == []
-
-
-def test_shuffle_hard_negative_nids(tmp_path):
-    # For config with gsprocessing_config.json
-    json_file_path = (f"/graphstorm/tests/unit-tests/gpartition/"
-                      f"config/gsprocessing_hard_negative_config.json")
-
-    # Generate dgl graph
+@pytest.fixture
+def setup_graph_partition(tmp_path):
     partitioned_graph = f"{tmp_path}/partitioned_graph"
 
     # Generate ID mapping for each partition
@@ -79,6 +58,36 @@ def test_shuffle_hard_negative_nids(tmp_path):
     reverse_map_dst = {gid: i for i, gid in enumerate(node_mapping["paper"].tolist())}
     reverse_map_dst[-1] = -1
 
+    return partitioned_graph, reverse_map_dst
+
+
+def test_load_hard_negative_config():
+    # For config with hard negative transformation
+    json_file_path = os.path.join(_ROOT,
+                                  "config/gsprocessing_hard_negative_config.json")
+
+    res = load_hard_negative_config(json_file_path)
+
+    assert res[0] == {'dst_node_type': 'paper', 'edge_type':
+        'author:writing:paper', 'hard_neg_feat_name': 'hard_neg_feat'}
+
+    # For config without hard negative transformation
+    json_file_path = os.path.join(_ROOT,
+                                  "config/gsprocessing_non_hard_negative_config.json")
+
+    res = load_hard_negative_config(json_file_path)
+
+    assert res == []
+
+
+def test_shuffle_hard_negative_nids(setup_graph_partition):
+    # Test the hard negative id shuffling process in a distributed setting
+
+    partitioned_graph, reverse_map_dst = setup_graph_partition
+    # For config with gsprocessing_config.json
+    json_file_path = os.path.join(_ROOT,
+                                  "config/gsprocessing_hard_negative_config.json")
+
     # generate edge features
     etype = ("author", "writing", "paper")
     edge_feat_path0 = os.path.join(partitioned_graph, "dist_graph", "part0", "edge_feat.dgl")

From 32429532600006ccdb6056b4e881b3f9ead17d05 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Tue, 12 Nov 2024 20:00:18 +0000
Subject: [PATCH 45/50] lint

---
 .../dist_hard_negative_transformation.py            |  3 ++-
 python/graphstorm/gconstruct/utils.py               |  8 ++++----
 python/graphstorm/gpartition/post_hard_negative.py  | 13 +++++++++++--
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index 5ccee36a81..e600e6646e 100755
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -93,7 +93,8 @@ def apply(self, input_df: DataFrame) -> DataFrame:
         )
         transformed_df = transformed_df.join(
             hard_negative_node_mapping,
-            transformed_df[EXPLODE_HARD_NEGATIVE_VALUE] == hard_negative_node_mapping[NODE_MAPPING_STR],
+            transformed_df[EXPLODE_HARD_NEGATIVE_VALUE]
+            == hard_negative_node_mapping[NODE_MAPPING_STR],
             "inner",
         ).select(NODE_MAPPING_INT, ORDER_INDEX)
         transformed_df = transformed_df.groupBy(ORDER_INDEX).agg(
diff --git a/python/graphstorm/gconstruct/utils.py b/python/graphstorm/gconstruct/utils.py
index a9b60fd872..e55d1f7173 100644
--- a/python/graphstorm/gconstruct/utils.py
+++ b/python/graphstorm/gconstruct/utils.py
@@ -1119,16 +1119,16 @@ def get_hard_edge_negs_feats(hard_edge_neg_ops):
     return hard_edge_neg_feats
 
 def get_gnid2pnid_map(ntype, node_mapping, gnid2pnid_mapping):
-    """ Get global nid to partitioned nid mapping
+    """ Get global nid to partitioned nid mapping.
 
         Parameters
         ----------
         ntype: str
             Path to the directory storing the partitioned graph.
-        node_mapping: dict of list
-            Dict of mapping. {ntype: mapping}
+        node_mapping: dict
+            Dict of mapping. {ntype: partitioned nid to global nid mapping}
         gnid2pnid_mapping: dict
-            Dict of mapping from global nid to partitioned id mapping.
+            Dict of mapping. {ntype: global nid to partitioned nid mapping}
     """
     if ntype in gnid2pnid_mapping:
         return gnid2pnid_mapping[ntype]
diff --git a/python/graphstorm/gpartition/post_hard_negative.py b/python/graphstorm/gpartition/post_hard_negative.py
index cdb0930fdf..402e68e302 100644
--- a/python/graphstorm/gpartition/post_hard_negative.py
+++ b/python/graphstorm/gpartition/post_hard_negative.py
@@ -17,7 +17,6 @@
 import json
 import os
 
-import torch as th
 from dgl.data.utils import load_tensors, save_tensors
 from graphstorm.model.utils import load_dist_nid_map
 from graphstorm.gconstruct.utils import get_gnid2pnid_map
@@ -30,6 +29,17 @@ def load_hard_negative_config(gsprocessing_config):
     ----------------
     gsprocessing_config: str
         Path to the gsprocessing config.
+
+    Returns
+    -------
+    list of dicts
+        A list of dicts, one per hard negative feature transformation.
+        Each dict will look like:
+        {
+            "dst_node_type": destination node type for hard negative,
+            "edge_type": edge_type,
+            "hard_neg_feat_name": feature name
+        }
     """
     with open(gsprocessing_config, 'r', encoding='utf-8') as file:
         config = json.load(file)
@@ -57,7 +67,6 @@ def load_hard_negative_config(gsprocessing_config):
 def shuffle_hard_negative_nids(gsprocessing_config, num_parts, graph_path):
     """Shuffle hard negative edge feature ids with int-to-int node id mapping.
     The function here align with the shuffle_hard_nids in graphstorm.gconstruct.utils.
-    Create an additional function to handle the id mappings under the distributed setting.
 
     Parameters
     ----------------

From 0dd32f9e11a3c78c1fc0fe9e18cf2a31cb3cf2e4 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Tue, 12 Nov 2024 21:33:35 +0000
Subject: [PATCH 46/50] fix comment

---
 .../dist_transformations/dist_hard_negative_transformation.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index e600e6646e..adaa270f98 100755
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -101,7 +101,7 @@ def apply(self, input_df: DataFrame) -> DataFrame:
             F.collect_list(NODE_MAPPING_INT).alias(input_col)
         )
 
-        # Same length for feature to convert to tensor
+        # Extend the feature to the same length as total number of nodes within one node type
         def pad_mapped_values(hard_neg_list):
             if len(hard_neg_list) < node_mapping_length:
                 hard_neg_list.extend([-1] * (node_mapping_length - len(hard_neg_list)))

From 7929f05de81d75a534ded95427bad9bd38dbbe52 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Tue, 12 Nov 2024 22:34:30 +0000
Subject: [PATCH 47/50] change maximum size

---
 .../dist_hard_negative_transformation.py                 | 9 +++++----
 .../tests/test_dist_hard_negative_transformation.py      | 8 ++++----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index adaa270f98..90d3b54031 100755
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -15,7 +15,7 @@
 """
 
 from typing import Sequence
-from pyspark.sql.functions import split, col
+from pyspark.sql.functions import split, col, size
 from pyspark.sql.types import ArrayType, IntegerType, StringType
 from pyspark.sql import DataFrame, functions as F, SparkSession
 
@@ -83,7 +83,8 @@ def apply(self, input_df: DataFrame) -> DataFrame:
         hard_negative_node_mapping = self.spark.read.parquet(
             f"{mapping_prefix}{dst_type}/{format_name}/"
         )
-        node_mapping_length = hard_negative_node_mapping.count()
+        max_size = transformed_df.select(F.size(F.col(input_col)).alias(f"{input_col}_size")) \
+                                 .agg(F.max(f"{input_col}_size")).collect()[0][0]
 
         # TODO: Use panda series to possibly improve the efficiency
         # Explode the original list and join node id mapping dataframe
@@ -103,8 +104,8 @@ def apply(self, input_df: DataFrame) -> DataFrame:
 
         # Extend the feature to the same length as total number of nodes within one node type
         def pad_mapped_values(hard_neg_list):
-            if len(hard_neg_list) < node_mapping_length:
-                hard_neg_list.extend([-1] * (node_mapping_length - len(hard_neg_list)))
+            if len(hard_neg_list) < max_size:
+                hard_neg_list.extend([-1] * (max_size - len(hard_neg_list)))
             return hard_neg_list
 
         pad_value_udf = F.udf(pad_mapped_values, ArrayType(IntegerType()))
diff --git a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py
index f153301eb7..8932bd1dbe 100755
--- a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py
+++ b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py
@@ -61,8 +61,8 @@ def test_hard_negative_example_list(spark: SparkSession, check_df_schema, tmp_pa
     check_df_schema(output_df)
     output_data = output_df.collect()
 
-    # Length should be 4 for each tensor because there are 4 distinct nodes for dst node
-    expected_output = [[1, -1, -1, -1], [2, 3, -1, -1], [3, 0, 1, -1], [0, -1, -1, -1]]
+    # Every output list is padded with -1 to the length of the longest input array.
+    expected_output = [[1, -1, -1], [2, 3, -1], [3, 0, 1], [0, -1, -1]]
 
     for idx, row in enumerate(output_data):
         np.testing.assert_equal(row[0], expected_output[idx], err_msg=f"Row {idx} is not equal")
@@ -101,8 +101,8 @@ def test_hard_negative_example_str(spark: SparkSession, check_df_schema, tmp_pat
     check_df_schema(output_df)
     output_data = output_df.collect()
 
-    # Length should be 4 for each tensor because there are 4 distinct nodes for dst node
-    expected_output = [[1, -1, -1, -1], [2, 3, -1, -1], [3, 0, 1, -1], [0, -1, -1, -1]]
+    # Every output list is padded with -1 to the length of the longest input array.
+    expected_output = [[1, -1, -1], [2, 3, -1], [3, 0, 1], [0, -1, -1]]
 
     for idx, row in enumerate(output_data):
         np.testing.assert_equal(row[0], expected_output[idx], err_msg=f"Row {idx} is not equal")

From d2e03dba1ae0808d4681d5cb94ca5665422e30fa Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Tue, 12 Nov 2024 22:39:41 +0000
Subject: [PATCH 48/50] black

---
 .../dist_hard_negative_transformation.py                   | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index 90d3b54031..b700fc89fa 100755
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -83,8 +83,11 @@ def apply(self, input_df: DataFrame) -> DataFrame:
         hard_negative_node_mapping = self.spark.read.parquet(
             f"{mapping_prefix}{dst_type}/{format_name}/"
         )
-        max_size = transformed_df.select(F.size(F.col(input_col)).alias(f"{input_col}_size")) \
-                                 .agg(F.max(f"{input_col}_size")).collect()[0][0]
+        max_size = (
+            transformed_df.select(F.size(F.col(input_col)).alias(f"{input_col}_size"))
+            .agg(F.max(f"{input_col}_size"))
+            .collect()[0][0]
+        )
 
         # TODO: Use panda series to possibly improve the efficiency
         # Explode the original list and join node id mapping dataframe

From d4da2ca3efcb458521afe47db032bb54414fbeea Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Tue, 12 Nov 2024 22:43:46 +0000
Subject: [PATCH 49/50] remove size

---
 .../dist_transformations/dist_hard_negative_transformation.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index b700fc89fa..2a72ab4d6c 100755
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -15,7 +15,7 @@
 """
 
 from typing import Sequence
-from pyspark.sql.functions import split, col, size
+from pyspark.sql.functions import split, col
 from pyspark.sql.types import ArrayType, IntegerType, StringType
 from pyspark.sql import DataFrame, functions as F, SparkSession
 

From 35d4cbd3b1dc5fc4139728b59d0d2b761c841e50 Mon Sep 17 00:00:00 2001
From: JalenCato <jalencato23pistons@gmail.com>
Date: Wed, 13 Nov 2024 23:17:01 +0000
Subject: [PATCH 50/50] test

---
 .../dist_hard_negative_transformation.py      |  4 +++-
 python/graphstorm/gconstruct/utils.py         | 23 +++++++++++++++----
 .../gpartition/post_hard_negative.py          |  5 ++--
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
index 2a72ab4d6c..35dd4005cc 100755
--- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
+++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py
@@ -83,6 +83,7 @@ def apply(self, input_df: DataFrame) -> DataFrame:
         hard_negative_node_mapping = self.spark.read.parquet(
             f"{mapping_prefix}{dst_type}/{format_name}/"
         )
+        # The maximum number of negatives in the input feature column
         max_size = (
             transformed_df.select(F.size(F.col(input_col)).alias(f"{input_col}_size"))
             .agg(F.max(f"{input_col}_size"))
@@ -92,6 +93,7 @@ def apply(self, input_df: DataFrame) -> DataFrame:
         # TODO: Use panda series to possibly improve the efficiency
         # Explode the original list and join node id mapping dataframe
         transformed_df = transformed_df.withColumn(ORDER_INDEX, F.monotonically_increasing_id())
+        # explode() can yield a very large DF: num_input_rows * avg(len_of_negatives) rows
         transformed_df = transformed_df.withColumn(
             EXPLODE_HARD_NEGATIVE_VALUE, F.explode(F.col(input_col))
         )
@@ -105,7 +107,7 @@ def apply(self, input_df: DataFrame) -> DataFrame:
             F.collect_list(NODE_MAPPING_INT).alias(input_col)
         )
 
-        # Extend the feature to the same length as total number of nodes within one node type
+        # Extend the feature to the same length as the maximum length of the feature column
         def pad_mapped_values(hard_neg_list):
             if len(hard_neg_list) < max_size:
                 hard_neg_list.extend([-1] * (max_size - len(hard_neg_list)))
diff --git a/python/graphstorm/gconstruct/utils.py b/python/graphstorm/gconstruct/utils.py
index e55d1f7173..d2077f632e 100644
--- a/python/graphstorm/gconstruct/utils.py
+++ b/python/graphstorm/gconstruct/utils.py
@@ -1118,17 +1118,32 @@ def get_hard_edge_negs_feats(hard_edge_neg_ops):
 
     return hard_edge_neg_feats
 
-def get_gnid2pnid_map(ntype, node_mapping, gnid2pnid_mapping):
+def get_gnid2pnid_map(ntype: str, node_mapping: dict, gnid2pnid_mapping: dict):
     """ Get global nid to partitioned nid mapping.
 
         Parameters
         ----------
         ntype: str
-            Path to the directory storing the partitioned graph.
+            Node type.
         node_mapping: dict
-            Dict of mapping. {ntype: partitioned nid to global nid mapping}
+            Dict of mapping.
+            {
+                ntype: 1D tensor representing the mapping from
+                partition node IDs (pnid) to global node IDs (gnid).
+                Each index corresponds to a partition node ID, and
+                the value at that index is the global node ID.
+            }
         gnid2pnid_mapping: dict
-            Dict of mapping. {ntype: global nid to partitioned nid mapping}
+            Dict of mapping, filled in by this function as a cache:
+            {
+                ntype: 1D tensor representing the mapping from
+                global node IDs (gnid) to partition node IDs (pnid).
+                Each index corresponds to a global node ID, and
+                the value at that index is the partition node ID.
+            }
+
+        Returns
+            1-D node mapping tensor (gnid to pnid) for the target node type.
     """
     if ntype in gnid2pnid_mapping:
         return gnid2pnid_mapping[ntype]
diff --git a/python/graphstorm/gpartition/post_hard_negative.py b/python/graphstorm/gpartition/post_hard_negative.py
index 402e68e302..0c86d85fc5 100644
--- a/python/graphstorm/gpartition/post_hard_negative.py
+++ b/python/graphstorm/gpartition/post_hard_negative.py
@@ -22,7 +22,7 @@
 from graphstorm.gconstruct.utils import get_gnid2pnid_map
 
 
-def load_hard_negative_config(gsprocessing_config):
+def load_hard_negative_config(gsprocessing_config: str):
     """Load GSProcessing Config to extract hard negative config
 
     Parameters
@@ -64,7 +64,8 @@ def load_hard_negative_config(gsprocessing_config):
     return hard_neg_list
 
 
-def shuffle_hard_negative_nids(gsprocessing_config, num_parts, graph_path):
+def shuffle_hard_negative_nids(gsprocessing_config: str,
+                               num_parts: int, graph_path: str):
     """Shuffle hard negative edge feature ids with int-to-int node id mapping.
     The function here align with the shuffle_hard_nids in graphstorm.gconstruct.utils.