From 84babde04eb6e6ad53bf58f9852b22647e24d2ab Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 29 Oct 2024 22:06:44 +0000 Subject: [PATCH 01/50] update gconstruct converter --- .../config_conversion/gconstruct_converter.py | 7 +++++++ graphstorm-processing/tests/test_converter.py | 16 ++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py index 5129254538..ce615e7a6a 100644 --- a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py +++ b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py @@ -188,6 +188,13 @@ def _convert_feature(feats: list[Mapping[str, Any]]) -> list[dict]: "hf_model": gconstruct_transform_dict["bert_model"], "max_seq_length": gconstruct_transform_dict["max_seq_length"], } + elif gconstruct_transform_dict["name"] == "edge_dst_hard_negative": + # Not check if it is link prediction task here + gsp_transformation_dict["name"] = "edge_dst_hard_negative" + if "separator" in gconstruct_transform_dict: + gsp_transformation_dict["kwargs"] = { + "separator": gconstruct_transform_dict["separator"], + } else: raise ValueError( "Unsupported GConstruct transformation name: " diff --git a/graphstorm-processing/tests/test_converter.py b/graphstorm-processing/tests/test_converter.py index 334ef72284..30b04f3855 100644 --- a/graphstorm-processing/tests/test_converter.py +++ b/graphstorm-processing/tests/test_converter.py @@ -401,7 +401,14 @@ def test_convert_gsprocessing(converter: GConstructConfigConverter): "files": ["/tmp/acm_raw/edges/author_writing_paper.parquet"], "source_id_col": "~from", "dest_id_col": "~to", - "features": [{"feature_col": ["author"], "feature_name": "feat"}], + "features": [ + {"feature_col": ["author"], "feature_name": "feat"}, + { + "feature_col": ["author"], + "feature_name": "hard_negative", + "transform": {"name": "edge_dst_hard_negative"}, + }, + ], "labels": [ { "label_col": "edge_col", @@ -505,7 +512,12 @@ def test_convert_gsprocessing(converter: GConstructConfigConverter): assert edges_output["dest"] == {"column": "~to", "type": "paper"} assert edges_output["relation"] == {"type": "writing"} assert edges_output["features"] == [ - {"column": "author", "transformation": {"name": "no-op"}, "name": "feat"} + {"column": "author", "transformation": {"name": "no-op"}, "name": "feat"}, + { + "column": "author", + "name": "hard_negative", + "transformation": {"name": "edge_dst_hard_negative"}, + }, ] assert edges_output["labels"] == [ { From ba88b9f41bb4617cc448abd6bca8151c85fcc38c Mon Sep 17 00:00:00 2001 From: JalenCato Date: Wed, 30 Oct 2024 21:36:03 +0000 Subject: [PATCH 02/50] gsprocessing part --- .../config/config_parser.py | 3 + .../config/hard_negative_configs.py | 42 +++++++++++ .../dist_feature_transformer.py | 3 + .../dist_hard_negative_transformation.py | 73 +++++++++++++++++++ 4 files changed, 121 insertions(+) create mode 100644 graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py create mode 100644 graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py diff --git a/graphstorm-processing/graphstorm_processing/config/config_parser.py b/graphstorm-processing/graphstorm_processing/config/config_parser.py index 38e92528d8..15f323ced8 100644 --- a/graphstorm-processing/graphstorm_processing/config/config_parser.py +++ b/graphstorm-processing/graphstorm_processing/config/config_parser.py @@ -29,6 +29,7 @@ ) from .categorical_configs import MultiCategoricalFeatureConfig from .hf_configs import HFConfig +from .hard_negative_configs import HardNegativeConfig from .data_config_base import DataStorageConfig @@ -71,6 +72,8 @@ def parse_feat_config(feature_dict: Dict) -> FeatureConfig: return MultiCategoricalFeatureConfig(feature_dict) elif transformation_name == "huggingface": return HFConfig(feature_dict) + elif transformation_name == "edge_dst_hard_negative": + return HardNegativeConfig(feature_dict) else: raise RuntimeError(f"Unknown transformation name: '{transformation_name}'") diff --git a/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py new file mode 100644 index 0000000000..c4924c57e0 --- /dev/null +++ b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py @@ -0,0 +1,42 @@ +""" +Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"). +You may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from typing import Mapping + +from .feature_config_base import FeatureConfig + + +class HardNegativeConfig(FeatureConfig): + """Feature configuration for hard negative feature. Now only support link prediction + + Supported kwargs + ---------------- + separator: str, optional + The separator for string input value. Only required when input value type is string. + """ + + def __init__(self, config: Mapping): + super().__init__(config) + self.separator = self._transformation_kwargs.get("separator") + + self._sanity_check() + + def _sanity_check(self) -> None: + super()._sanity_check() + assert self.action in [ + HUGGINGFACE_TOKENIZE, + HUGGINGFACE_EMB, + ], f"huggingface action needs to be one of {HUGGINGFACE_TOKENIZE, HUGGINGFACE_EMB}" diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py index b68c3eeb96..57ba1b84de 100644 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py @@ -28,6 +28,7 @@ DistCategoryTransformation, DistMultiCategoryTransformation, DistHFTransformation, + DistHardNegativeTransformation, ) @@ -69,6 +70,8 @@ def __init__( self.transformation = DistMultiCategoryTransformation(**default_kwargs, **args_dict) elif feat_type == "huggingface": self.transformation = DistHFTransformation(**default_kwargs, **args_dict) + elif feat_type == "edge_dst_hard_negative": + self.transformation = DistHardNegativeTransformation(**default_kwargs, **args_dict) else: raise NotImplementedError( f"Feature {feat_name} has type: {feat_type} that is not supported" diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py new file mode 100644 index 0000000000..cfc8211404 --- /dev/null +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -0,0 +1,73 @@ +""" +Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"). +You may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import logging +import os +from typing import Sequence +import numpy as np +import torch as th +from pyspark.sql import DataFrame +from pyspark.sql.types import ArrayType, IntegerType, FloatType, StructType, StructField +from pyspark.sql.functions import udf +from transformers import AutoTokenizer, AutoModel, AutoConfig + +from graphstorm_processing.constants import HUGGINGFACE_TOKENIZE, HUGGINGFACE_EMB +from .base_dist_transformation import DistributedTransformation + + +def apply_transform( + cols: Sequence[str], separator: str, input_df: DataFrame +) -> DataFrame: + """Applies hard negative transformation to each row. + + Parameters + ---------- + cols : Sequence[str] + List of column names to apply normalization to. + separator: str, optional + The separator for string input value. Only required when input value type is string. + """ + + return transformed_df + + +class DistHardNegativeTransformation(DistributedTransformation): + """Transformation to apply hard negative transformation. + + Parameters + ---------- + separator: str, optional + The separator for string input value. Only required when input value type is string. + """ + + def __init__( + self, cols: Sequence[str], separator: str = "", + ) -> None: + super().__init__(cols) + self.cols = cols + assert len(self.cols) == 1, "Hard Negative Transformation only supports single column" + self.separator = separator + + def apply(self, input_df: DataFrame) -> DataFrame: + transformed_df = apply_transform( + self.cols, self.separator, input_df + ) + + return transformed_df + + @staticmethod + def get_transformation_name() -> str: + return "DistHardNegativeTransformation" From 613814fe3fd1e8b7fae6899bee5cd9e4e26440d2 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Wed, 30 Oct 2024 21:41:30 +0000 Subject: [PATCH 03/50] hard negative config --- .../graphstorm_processing/config/hard_negative_configs.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py index c4924c57e0..00a99c8f1b 100644 --- a/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py +++ b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py @@ -30,13 +30,9 @@ class HardNegativeConfig(FeatureConfig): def __init__(self, config: Mapping): super().__init__(config) - self.separator = self._transformation_kwargs.get("separator") + self.separator = self._transformation_kwargs.get("separator", None) self._sanity_check() def _sanity_check(self) -> None: - super()._sanity_check() - assert self.action in [ - HUGGINGFACE_TOKENIZE, - HUGGINGFACE_EMB, - ], f"huggingface action needs to be one of {HUGGINGFACE_TOKENIZE, HUGGINGFACE_EMB}" + super()._sanity_check() \ No newline at end of file From 53ea8007a4fd367992ef7ebcb7a60f1e3ea54584 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Wed, 30 Oct 2024 23:17:38 +0000 Subject: [PATCH 04/50] add code file --- .../dist_feature_transformer.py | 6 ++++-- .../dist_transformations/__init__.py | 1 + .../dist_hard_negative_transformation.py | 21 ++++++++++++------- .../dist_heterogeneous_loader.py | 7 ++++++- 4 files changed, 24 insertions(+), 11 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py index 57ba1b84de..f3dfeac509 100644 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py @@ -39,7 +39,7 @@ class DistFeatureTransformer(object): """ def __init__( - self, feature_config: FeatureConfig, spark: SparkSession, json_representation: dict + self, feature_config: FeatureConfig, spark: SparkSession, json_representation: dict, edge_mapping_dict: dict = None ): feat_type = feature_config.feat_type feat_name = feature_config.feat_name @@ -47,6 +47,8 @@ def __init__( self.transformation: DistributedTransformation # We use this to re-apply transformations self.json_representation = json_representation + # Edge mapping file location + self.edge_mapping_dict = edge_mapping_dict default_kwargs = { "cols": feature_config.cols, @@ -71,7 +73,7 @@ def __init__( elif feat_type == "huggingface": self.transformation = DistHFTransformation(**default_kwargs, **args_dict) elif feat_type == "edge_dst_hard_negative": - self.transformation = DistHardNegativeTransformation(**default_kwargs, **args_dict) + self.transformation = DistHardNegativeTransformation(**default_kwargs, **args_dict, edge_mapping_dict=edge_mapping_dict) else: raise NotImplementedError( f"Feature {feat_name} has type: {feat_type} that is not supported" diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/__init__.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/__init__.py index 4849c53acc..5c74d4928a 100644 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/__init__.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/__init__.py @@ -15,3 +15,4 @@ ) from .dist_bucket_numerical_transformation import DistBucketNumericalTransformation from .dist_hf_transformation import DistHFTransformation +from .dist_hard_negative_transformation import DistHardNegativeTransformation diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index cfc8211404..34d08c975c 100644 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -17,19 +17,14 @@ import logging import os from typing import Sequence -import numpy as np -import torch as th from pyspark.sql import DataFrame -from pyspark.sql.types import ArrayType, IntegerType, FloatType, StructType, StructField from pyspark.sql.functions import udf -from transformers import AutoTokenizer, AutoModel, AutoConfig -from graphstorm_processing.constants import HUGGINGFACE_TOKENIZE, HUGGINGFACE_EMB from .base_dist_transformation import DistributedTransformation def apply_transform( - cols: Sequence[str], separator: str, input_df: DataFrame + cols: Sequence[str], separator: str, input_df: DataFrame, edge_mapping_dict: dict ) -> DataFrame: """Applies hard negative transformation to each row. @@ -39,8 +34,16 @@ def apply_transform( List of column names to apply normalization to. separator: str, optional The separator for string input value. Only required when input value type is string. + input_df : DataFrame + The input DataFrame to apply normalization to. + edge_mapping_dict: dict + The mapping dictionary contain mapping file directory and edge type """ + input_df.show() + print(edge_mapping_dict) + exit(-1) + return transformed_df @@ -54,16 +57,18 @@ class DistHardNegativeTransformation(DistributedTransformation): """ def __init__( - self, cols: Sequence[str], separator: str = "", + self, cols: Sequence[str], separator: str = "", edge_mapping_dict=None ) -> None: super().__init__(cols) self.cols = cols assert len(self.cols) == 1, "Hard Negative Transformation only supports single column" self.separator = separator + self.edge_mapping_dict = edge_mapping_dict + assert self.edge_mapping_dict, "edge mapping dict cannot be None for hard negative " def apply(self, input_df: DataFrame) -> DataFrame: transformed_df = apply_transform( - self.cols, self.separator, input_df + self.cols, self.separator, input_df, self.edge_mapping_dict ) return transformed_df diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py index db56720840..1e86edebf4 100644 --- a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py +++ b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py @@ -1654,7 +1654,12 @@ def _process_edge_features( .get(edge_type, {}) .get(feat_conf.feat_name, {}) ) - transformer = DistFeatureTransformer(feat_conf, self.spark, json_representation) + # Hard Negative Transformation use case, but should be able to be reused + edge_mapping_dict = { + "edge_type": edge_type, + "mapping_path": f"{self.output_prefix}/raw_id_mappings/" + } + transformer = DistFeatureTransformer(feat_conf, self.spark, json_representation, edge_mapping_dict) if json_representation: logging.info( From 9e9d35e77ed09e903599b07c8876905f7a4fb0d2 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Thu, 31 Oct 2024 22:17:59 +0000 Subject: [PATCH 05/50] finish gsprocessing related development --- .../graphstorm_processing/constants.py | 3 ++ .../dist_feature_transformer.py | 3 +- .../dist_hard_negative_transformation.py | 50 +++++++++++++++---- .../dist_heterogeneous_loader.py | 3 +- .../graph_loaders/schema_utils.py | 1 - 5 files changed, 46 insertions(+), 14 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/constants.py b/graphstorm-processing/graphstorm_processing/constants.py index cbb48f4c02..fc27b17686 100644 --- a/graphstorm-processing/graphstorm_processing/constants.py +++ b/graphstorm-processing/graphstorm_processing/constants.py @@ -58,6 +58,9 @@ HUGGINGFACE_TOKENIZE = "tokenize_hf" HUGGINGFACE_EMB = "embedding_hf" +################# Node Mapping ################ +NODE_MAPPING_STR = "orig" +NODE_MAPPING_INT = "new" ################# Supported execution envs ############## class ExecutionEnv(Enum): diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py index f3dfeac509..a9c0a14ae5 100644 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py @@ -73,7 +73,8 @@ def __init__( elif feat_type == "huggingface": self.transformation = DistHFTransformation(**default_kwargs, **args_dict) elif feat_type == "edge_dst_hard_negative": - self.transformation = DistHardNegativeTransformation(**default_kwargs, **args_dict, edge_mapping_dict=edge_mapping_dict) + self.transformation = DistHardNegativeTransformation(**default_kwargs, **args_dict, + spark=spark, edge_mapping_dict=edge_mapping_dict) else: raise NotImplementedError( f"Feature {feat_name} has type: {feat_type} that is not supported" diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index 34d08c975c..2b859112a6 100644 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -17,14 +17,16 @@ import logging import os from typing import Sequence -from pyspark.sql import DataFrame -from pyspark.sql.functions import udf +from pyspark.sql.functions import udf, split, col +from pyspark.sql.types import ArrayType, IntegerType, StringType +from pyspark.sql import DataFrame, functions as F, SparkSession from .base_dist_transformation import DistributedTransformation +from graphstorm_processing.constants import NODE_MAPPING_STR, NODE_MAPPING_INT def apply_transform( - cols: Sequence[str], separator: str, input_df: DataFrame, edge_mapping_dict: dict + cols: Sequence[str], separator: str, spark: SparkSession, input_df: DataFrame, edge_mapping_dict: dict ) -> DataFrame: """Applies hard negative transformation to each row. @@ -39,11 +41,32 @@ def apply_transform( edge_mapping_dict: dict The mapping dictionary contain mapping file directory and edge type """ - - input_df.show() - print(edge_mapping_dict) - exit(-1) - + column_type = input_df.schema[cols[0]].dataType + if isinstance(column_type, StringType): + transformed_df = input_df.withColumn(cols[0], split(col(cols[0]), separator)) + else: + transformed_df = input_df + # Edge type should be (src_ntype:get_relation_name()}:dst_ntype) + # Assume all the node type in the hard negative feature should be dst node type + _, _, dst_type = edge_mapping_dict["edge_type"].split(":") + mapping_prefix = edge_mapping_dict["mapping_path"] + format_name = edge_mapping_dict["format_name"] + hard_negative_node_mapping = spark.read.parquet(f"{mapping_prefix}{dst_type}/{format_name}/*.parquet") + node_mapping_length = hard_negative_node_mapping.count() + + # TODO: This method may suffer from scalability issue, we can make this method to join-based solution. + hard_negative_node_mapping_dict = {row[NODE_MAPPING_STR]: row[NODE_MAPPING_INT] for row in hard_negative_node_mapping.collect()} + + # Same length for feature to convert to tensor + def map_values(hard_neg_list): + mapped_values = [hard_negative_node_mapping_dict.get(item, -1) for item in hard_neg_list] + while len(mapped_values) < node_mapping_length: + mapped_values.append(-1) + return mapped_values + + map_values_udf = F.udf(map_values, ArrayType(IntegerType())) + + transformed_df = transformed_df.select(map_values_udf(F.col(cols[0])).alias(cols[0])) return transformed_df @@ -54,12 +77,16 @@ class DistHardNegativeTransformation(DistributedTransformation): ---------- separator: str, optional The separator for string input value. Only required when input value type is string. + spark: SparkSession + The spark session + edge_mapping_dict: dict + The node type and mapping directory """ def __init__( - self, cols: Sequence[str], separator: str = "", edge_mapping_dict=None + self, cols: Sequence[str], spark: SparkSession, separator: str = "", edge_mapping_dict=None ) -> None: - super().__init__(cols) + super().__init__(cols, spark) self.cols = cols assert len(self.cols) == 1, "Hard Negative Transformation only supports single column" self.separator = separator @@ -67,8 +94,9 @@ def __init__( assert self.edge_mapping_dict, "edge mapping dict cannot be None for hard negative " def apply(self, input_df: DataFrame) -> DataFrame: + assert self.spark transformed_df = apply_transform( - self.cols, self.separator, input_df, self.edge_mapping_dict + self.cols, self.separator, self.spark, input_df, self.edge_mapping_dict ) return transformed_df diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py index 1e86edebf4..ef3d5c00c2 100644 --- a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py +++ b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py @@ -1657,7 +1657,8 @@ def _process_edge_features( # Hard Negative Transformation use case, but should be able to be reused edge_mapping_dict = { "edge_type": edge_type, - "mapping_path": f"{self.output_prefix}/raw_id_mappings/" + "mapping_path": f"{self.output_prefix}/raw_id_mappings/", + "format_name": FORMAT_NAME } transformer = DistFeatureTransformer(feat_conf, self.spark, json_representation, edge_mapping_dict) diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py b/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py index 19ed03869d..ae9b22a397 100644 --- a/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py +++ b/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py @@ -65,7 +65,6 @@ def _parse_features_schema(features_objects: Sequence[FeatureConfig]) -> Sequenc if StructField(feature_col, spark_feature_type(), True) in field_list: continue field_list.append(StructField(feature_col, spark_feature_type(), True)) - return field_list From 37e30e62636afea481c88d5bfeb5a9df42babc0a Mon Sep 17 00:00:00 2001 From: JalenCato Date: Thu, 31 Oct 2024 22:19:26 +0000 Subject: [PATCH 06/50] add blank --- .../graphstorm_processing/graph_loaders/schema_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py b/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py index ae9b22a397..b65a5d92dc 100644 --- a/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py +++ b/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py @@ -65,6 +65,7 @@ def _parse_features_schema(features_objects: Sequence[FeatureConfig]) -> Sequenc if StructField(feature_col, spark_feature_type(), True) in field_list: continue field_list.append(StructField(feature_col, spark_feature_type(), True)) + return field_list From 8b702e51acf463ac767963bbb7570fdaf1bcbd1f Mon Sep 17 00:00:00 2001 From: JalenCato Date: Thu, 31 Oct 2024 22:20:01 +0000 Subject: [PATCH 07/50] tab --- .../graphstorm_processing/graph_loaders/schema_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py b/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py index b65a5d92dc..19ed03869d 100644 --- a/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py +++ b/graphstorm-processing/graphstorm_processing/graph_loaders/schema_utils.py @@ -65,7 +65,7 @@ def _parse_features_schema(features_objects: Sequence[FeatureConfig]) -> Sequenc if StructField(feature_col, spark_feature_type(), True) in field_list: continue field_list.append(StructField(feature_col, spark_feature_type(), True)) - + return field_list From a50b657d0cc707f3c18424affa788605946fbbc5 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Fri, 1 Nov 2024 00:06:58 +0000 Subject: [PATCH 08/50] hard negative for gspartition --- python/graphstorm/gpartition/__init__.py | 1 + .../gpartition/dist_partition_graph.py | 22 +++++- .../gpartition/post_hard_negative.py | 73 +++++++++++++++++++ python/graphstorm/model/utils.py | 3 + 4 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 python/graphstorm/gpartition/post_hard_negative.py diff --git a/python/graphstorm/gpartition/__init__.py b/python/graphstorm/gpartition/__init__.py index c7957002c2..818d6ca79a 100644 --- a/python/graphstorm/gpartition/__init__.py +++ b/python/graphstorm/gpartition/__init__.py @@ -19,3 +19,4 @@ from .metis_partition import (ParMetisPartitionAlgorithm) from .partition_config import (ParMETISConfig) from .partition_algo_base import LocalPartitionAlgorithm +from .post_hard_negative import shuffle_hard_negative_nids \ No newline at end of file diff --git a/python/graphstorm/gpartition/dist_partition_graph.py b/python/graphstorm/gpartition/dist_partition_graph.py index da50ce8ca6..8f26def57f 100644 --- a/python/graphstorm/gpartition/dist_partition_graph.py +++ b/python/graphstorm/gpartition/dist_partition_graph.py @@ -38,6 +38,7 @@ ParMetisPartitionAlgorithm, ParMETISConfig, RandomPartitionAlgorithm, + shuffle_hard_negative_nids, ) from graphstorm.utils import get_log_level @@ -189,12 +190,31 @@ def main(): dirs_exist_ok=True, ) + # Hard Negative Mapping + if args.gsprocessing_config: + gsprocessing_config = args.gsprocessing_config + shuffle_hard_negative_nids(f"{args.input_path}/{gsprocessing_config}", args.output_path) + else: + for filename in os.listdir(args.input_path): + if filename.endswith("_with_transformations.json"): + gsprocessing_config = filename + shuffle_hard_negative_nids(f"{args.input_path}/{gsprocessing_config}", + args.num_parts, args.output_path) + break + else: + # Did not raise error here for not introducing the break change, + # but will raise warning here to warn customers. + logging.info("Skip the hard negative node ID mapping, " + "upgrade the latest GSProcessing to solve the warning here.") + def parse_args() -> argparse.Namespace: """Parses arguments for the script""" argparser = argparse.ArgumentParser("Partition DGL graphs for node and edge classification " + "or regression tasks") argparser.add_argument("--input-path", type=str, required=True, help="Path to input DGL chunked data.") + argparser.add_argument("--gsprocessing-config", type=str, + help="Path to the input GSProcessing config data.") argparser.add_argument("--metadata-filename", type=str, default="metadata.json", help="Name for the chunked DGL data metadata file.") argparser.add_argument("--output-path", type=str, required=True, @@ -224,4 +244,4 @@ def parse_args() -> argparse.Namespace: if __name__ == '__main__': - main() + main() \ No newline at end of file diff --git a/python/graphstorm/gpartition/post_hard_negative.py b/python/graphstorm/gpartition/post_hard_negative.py new file mode 100644 index 0000000000..32f4adf634 --- /dev/null +++ b/python/graphstorm/gpartition/post_hard_negative.py @@ -0,0 +1,73 @@ +import json +import os + +import torch as th +from dgl.data.utils import load_tensors, save_tensors +from graphstorm.model.utils import load_dist_nid_map + +def load_hard_negative_config(gsprocessing_config): + with open(gsprocessing_config, 'r') as file: + config = json.load(file) + + # Hard Negative only supports link prediction + edges_config = config['graph']['edges'] + mapping_edge_list = [] + for single_edge_config in edges_config: + if "features" not in single_edge_config: + continue + feature_dict = single_edge_config["features"] + for single_feature in feature_dict: + if single_feature["transformation"]["name"] \ + == "edge_dst_hard_negative": + edge_type = ":".join([single_edge_config["source"]["type"], + single_edge_config["relation"]["type"], + single_edge_config["dest"]["type"]]) + hard_neg_feat_name = single_feature['name'] + mapping_edge_list.append({"dst_node_type": single_edge_config["dest"]["type"], + "edge_type": edge_type, + "hard_neg_feat_name": hard_neg_feat_name}) + return mapping_edge_list + + +def shuffle_hard_negative_nids(gsprocessing_config, num_parts, output_path): + shuffled_edge_config = load_hard_negative_config(gsprocessing_config) + + node_type_list = [] + for single_shuffled_edge_config in shuffled_edge_config: + node_type = single_shuffled_edge_config["dst_node_type"] + node_type_list.append(node_type) + node_mapping = load_dist_nid_map(f"{output_path}/dist_graph", node_type_list) + gnid2pnid_mapping = {} + + def get_gnid2pnid_map(ntype): + if ntype in gnid2pnid_mapping: + return gnid2pnid_mapping[ntype] + else: + pnid2gnid_map = node_mapping[ntype] + gnid2pnid_map = th.argsort(pnid2gnid_map) + gnid2pnid_mapping[ntype] = gnid2pnid_map + # del ntype in node_mapping to save memory + del node_mapping[ntype] + return gnid2pnid_mapping[ntype] + + # iterate all the partitions to convert hard negative node ids. + for i in range(num_parts): + part_path = os.path.join(f"{output_path}/dist_graph", f"part{i}") + edge_feat_path = os.path.join(part_path, "edge_feat.dgl") + + # load edge features first + edge_feats = load_tensors(edge_feat_path) + for single_shuffled_edge_config in shuffled_edge_config: + etype = single_shuffled_edge_config["edge_type"] + neg_feat = single_shuffled_edge_config["hard_neg_feat_name"] + neg_ntype = single_shuffled_edge_config["dst_node_type"] + efeat_name = f"{etype}/{neg_feat}" + hard_nids = edge_feats[efeat_name].long() + hard_nid_idx = hard_nids > -1 + gnid2pnid_map = get_gnid2pnid_map(neg_ntype) + hard_nids[hard_nid_idx] = gnid2pnid_map[hard_nids[hard_nid_idx]] + + # replace the edge_feat.dgl with the updated one. + os.remove(edge_feat_path) + save_tensors(edge_feat_path, edge_feats) + diff --git a/python/graphstorm/model/utils.py b/python/graphstorm/model/utils.py index 0969c95f5d..49fe896787 100644 --- a/python/graphstorm/model/utils.py +++ b/python/graphstorm/model/utils.py @@ -393,6 +393,9 @@ def _exchange_node_id_mapping(rank, world_size, device, # move mapping into CPU return gather_list[0].to(th.device("cpu")) +def load_dist_nid_map(node_id_mapping_file, ntypes): + return _load_dist_nid_map(node_id_mapping_file, ntypes) + def _load_dist_nid_map(node_id_mapping_file, ntypes): """ Load id mapping files in dist partition format. """ From 8def740f2c45aaff6a7617a041784709055b32ed Mon Sep 17 00:00:00 2001 From: JalenCato Date: Mon, 4 Nov 2024 18:27:43 +0000 Subject: [PATCH 09/50] add doc string --- .../gpartition/dist_partition_graph.py | 3 +- .../gpartition/post_hard_negative.py | 39 ++++++++++++++++++- python/graphstorm/model/utils.py | 2 + 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/python/graphstorm/gpartition/dist_partition_graph.py b/python/graphstorm/gpartition/dist_partition_graph.py index 8f26def57f..7f7bc46c11 100644 --- a/python/graphstorm/gpartition/dist_partition_graph.py +++ b/python/graphstorm/gpartition/dist_partition_graph.py @@ -205,7 +205,8 @@ def main(): # Did not raise error here for not introducing the break change, # but will raise warning here to warn customers. logging.info("Skip the hard negative node ID mapping, " - "upgrade the latest GSProcessing to solve the warning here.") + "please upgrade to the latest GSProcessing.") + def parse_args() -> argparse.Namespace: """Parses arguments for the script""" diff --git a/python/graphstorm/gpartition/post_hard_negative.py b/python/graphstorm/gpartition/post_hard_negative.py index 32f4adf634..2c98bbbb95 100644 --- a/python/graphstorm/gpartition/post_hard_negative.py +++ b/python/graphstorm/gpartition/post_hard_negative.py @@ -1,3 +1,19 @@ +""" + Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + import json import os @@ -5,7 +21,15 @@ from dgl.data.utils import load_tensors, save_tensors from graphstorm.model.utils import load_dist_nid_map + def load_hard_negative_config(gsprocessing_config): + """Load GSProcessing Config to extract hard negative config + + Parameters + ---------------- + gsprocessing_config: str + Path to the gsprocessing config. + """ with open(gsprocessing_config, 'r') as file: config = json.load(file) @@ -30,6 +54,19 @@ def load_hard_negative_config(gsprocessing_config): def shuffle_hard_negative_nids(gsprocessing_config, num_parts, output_path): + """Shuffle hard negative edge feature ids with int-to-int node id mapping. + The function here align with the shuffle_hard_nids in graphstorm.gconstruct.utils. + Create an additional function to handle the id mappings under distributed setting. + + Parameters + ---------------- + gsprocessing_config: str + Path to the gsprocessing config. + num_parts: int + Number of parts. + output_path: str + Path to the output DGL graph. + """ shuffled_edge_config = load_hard_negative_config(gsprocessing_config) node_type_list = [] @@ -70,4 +107,4 @@ def get_gnid2pnid_map(ntype): # replace the edge_feat.dgl with the updated one. os.remove(edge_feat_path) save_tensors(edge_feat_path, edge_feats) - + diff --git a/python/graphstorm/model/utils.py b/python/graphstorm/model/utils.py index 49fe896787..3d12ec3458 100644 --- a/python/graphstorm/model/utils.py +++ b/python/graphstorm/model/utils.py @@ -394,6 +394,8 @@ def _exchange_node_id_mapping(rank, world_size, device, return gather_list[0].to(th.device("cpu")) def load_dist_nid_map(node_id_mapping_file, ntypes): + """ Wrapper for load_dist_nid_map. + """ return _load_dist_nid_map(node_id_mapping_file, ntypes) def _load_dist_nid_map(node_id_mapping_file, ntypes): From 644ba29930b2dd17f31d54286fd9e6853c544b46 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Mon, 4 Nov 2024 20:02:39 +0000 Subject: [PATCH 10/50] add gsprocessing part test --- .../dist_hard_negative_transformation.py | 2 +- .../test_dist_hard_negative_transformation.py | 107 ++++++++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) create mode 100755 graphstorm-processing/tests/test_dist_hard_negative_transformation.py diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index 2b859112a6..5a7a76e38d 100644 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -65,8 +65,8 @@ def map_values(hard_neg_list): return mapped_values map_values_udf = F.udf(map_values, ArrayType(IntegerType())) - transformed_df = transformed_df.select(map_values_udf(F.col(cols[0])).alias(cols[0])) + return transformed_df diff --git a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py new file mode 100755 index 0000000000..cbc6b83687 --- /dev/null +++ b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py @@ -0,0 +1,107 @@ +""" +Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"). +You may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import pytest +from pyspark.sql import DataFrame, SparkSession +import numpy as np +from numpy.testing import assert_array_equal + +from graphstorm_processing.constants import NODE_MAPPING_STR, NODE_MAPPING_INT +from graphstorm_processing.data_transformations.dist_transformations import ( + DistHardNegativeTransformation, +) + + +def test_hard_negative_example_list(spark: SparkSession, check_df_schema, tmp_path): + # Input Data DataFrame + data = [ + ("mark", "doctor", ["scientist"]), + ("john", "scientist", ["engineer", "nurse"]), + ("tara", "engineer", ["nurse", "doctor", "scientist"]), + ("jen", "nurse", ["doctor"]), + ] + columns = ["src_type", "dst_type", "hard_negative"] + input_df = spark.createDataFrame(data, schema=columns) + + # Mapping DataFrame + mapping_data = [ + ("doctor", 0), + ("scientist", 1), + ("engineer", 2), + ("nurse", 3), + ] + mapping_column = [NODE_MAPPING_STR, NODE_MAPPING_INT] + mapping_df = spark.createDataFrame(mapping_data, schema=mapping_column) + mapping_df.repartition(1).write.parquet(f"{tmp_path}/raw_id_mappings/dst_type/parquet") + edge_mapping_dict = { + "edge_type": "src_type:relation:dst_type", + "mapping_path": f"{tmp_path}/raw_id_mappings/", + "format_name": "parquet", + } + hard_negative_transformation = DistHardNegativeTransformation( + ["hard_negative"], spark=spark, edge_mapping_dict=edge_mapping_dict, separator=None + ) + output_df = hard_negative_transformation.apply(input_df) + check_df_schema(output_df) + output_data = output_df.collect() + + expected_output = [[1, -1, -1, -1], [2, 3, -1, -1], [3, 0, 1, -1], [0, -1, -1, -1]] + + for idx, row in enumerate(output_data): + np.testing.assert_almost_equal( + row[0], expected_output[idx], decimal=3, err_msg=f"Row {idx} is not equal" + ) + + +def test_hard_negative_example_str(spark: SparkSession, check_df_schema, tmp_path): + # Input Data DataFrame + data = [ + ("mark", "doctor", "scientist"), + ("john", "scientist", "engineer;nurse"), + ("tara", "engineer", "nurse;doctor;scientist"), + ("jen", "nurse", "doctor"), + ] + columns = ["src_type", "dst_type", "hard_negative"] + input_df = spark.createDataFrame(data, schema=columns) + + # Mapping DataFrame + mapping_data = [ + ("doctor", 0), + ("scientist", 1), + ("engineer", 2), + ("nurse", 3), + ] + mapping_column = [NODE_MAPPING_STR, NODE_MAPPING_INT] + mapping_df = spark.createDataFrame(mapping_data, schema=mapping_column) + mapping_df.repartition(1).write.parquet(f"{tmp_path}/raw_id_mappings/dst_type/parquet") + edge_mapping_dict = { + "edge_type": "src_type:relation:dst_type", + "mapping_path": f"{tmp_path}/raw_id_mappings/", + "format_name": "parquet", + } + hard_negative_transformation = DistHardNegativeTransformation( + ["hard_negative"], spark=spark, edge_mapping_dict=edge_mapping_dict, separator=";" + ) + output_df = hard_negative_transformation.apply(input_df) + check_df_schema(output_df) + output_data = output_df.collect() + + expected_output = [[1, -1, -1, -1], [2, 3, -1, -1], [3, 0, 1, -1], [0, -1, -1, -1]] + + for idx, row in enumerate(output_data): + np.testing.assert_almost_equal( + row[0], expected_output[idx], decimal=3, err_msg=f"Row {idx} is not equal" + ) From 1f085da8be3620223376d36930ac13ad1741a2cf Mon Sep 17 00:00:00 2001 From: JalenCato Date: Mon, 4 Nov 2024 23:42:26 +0000 Subject: [PATCH 11/50] add test for gspartition part --- tests/unit-tests/gpartition/conftest.py | 7 +- .../test_hard_negative_post_partition.py | 340 ++++++++++++++++++ 2 files changed, 345 insertions(+), 2 deletions(-) create mode 100644 tests/unit-tests/gpartition/test_hard_negative_post_partition.py diff --git a/tests/unit-tests/gpartition/conftest.py b/tests/unit-tests/gpartition/conftest.py index 6e522e3a5e..9f2770e1c7 100644 --- a/tests/unit-tests/gpartition/conftest.py +++ b/tests/unit-tests/gpartition/conftest.py @@ -21,6 +21,8 @@ import pytest from graphstorm.gpartition import LocalPartitionAlgorithm +from graphstorm.gpartition.post_hard_negative import (shuffle_hard_negative_nids, + load_hard_negative_config) @pytest.fixture(scope="module", name="chunked_metadata_dict") def metadata_dict_fixture() -> Dict: @@ -29,6 +31,7 @@ def metadata_dict_fixture() -> Dict: "node_type": ["a", "b"], } + def simple_test_partition( partition_algorithm: LocalPartitionAlgorithm, algorithm_name: str, @@ -61,7 +64,7 @@ def simple_test_partition( with open(os.path.join(tmpdir, "partition_meta.json"), 'r', encoding="utf-8") as f: part_meta = json.load(f) assert part_meta["num_parts"] == num_parts - assert part_meta["algo_name"] == algorithm_name + assert part_meta["algo_name"] == algorithm_name # Ensure contents of partition assignment files are correct for i, node_type in enumerate(chunked_metadata_dict["node_type"]): @@ -70,4 +73,4 @@ def simple_test_partition( assert len(node_partitions) == chunked_metadata_dict["num_nodes_per_type"][i] for part_id in node_partitions: assert part_id.isdigit() - assert int(part_id) < num_parts + assert int(part_id) < num_parts \ No newline at end of file diff --git a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py new file mode 100644 index 0000000000..0c3b32bf05 --- /dev/null +++ b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py @@ -0,0 +1,340 @@ +""" + Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import os +import json +import torch as th +import numpy as np +from typing import Dict + +import pytest + +from numpy.testing import assert_almost_equal +from graphstorm.model.utils import load_dist_nid_map +from dgl.data.utils import load_tensors, save_tensors +from graphstorm.gpartition.post_hard_negative import (shuffle_hard_negative_nids, + load_hard_negative_config) + +@pytest.fixture(scope="module", name="gsprocessing_hard_negative_config") +def gsprocessing_config_hard_negative_dict_fixture() -> Dict: + return{ + "graph": { + "nodes": [ + { + "data": { + "format": "parquet", + "files": [ + "./nodes/author.parquet" + ] + }, + "type": "author", + "column": "node_id", + }, + { + "data": { + "format": "parquet", + "files": [ + "./nodes/paper.parquet" + ] + }, + "type": "paper", + "column": "node_id", + "features": [ + { + "column": "feat", + "name": "feat", + "transformation": { + "name": "no-op" + } + } + ], + "labels": [ + { + "column": "label", + "type": "classification", + "split_rate": { + "train": 0.8, + "val": 0.1, + "test": 0.1 + } + } + ] + } + ], + "edges": [ + { + "data": { + "format": "parquet", + "files": [ + "./edges/author_writing_paper_hard_negative.parquet" + ] + }, + "source": { + "column": "source_id", + "type": "author" + }, + "dest": { + "column": "dest_id", + "type": "paper" + }, + "relation": { + "type": "writing" + }, + "features": [ + { + "column": "hard_neg", + "name": "hard_neg_feat", + "transformation": { + "name": "edge_dst_hard_negative", + "kwargs": { + "separator": ";" + } + } + } + ] + }, + { + "data": { + "format": "parquet", + "files": [ + "./edges/paper_citing_paper.parquet" + ] + }, + "source": { + "column": "source_id", + "type": "paper" + }, + "dest": { + "column": "dest_id", + "type": "paper" + }, + "relation": { + "type": "citing" + }, + "labels": [ + { + "column": "", + "type": "link_prediction", + "split_rate": { + "train": 0.8, + "val": 0.1, + "test": 0.1 + } + } + ] + } + ] + }, + "version": "gsprocessing-v1.0" + } + + +@pytest.fixture(scope="module", name="gsprocessing_non_hard_negative_config") +def gsprocessing_config_non_hard_negative_dict_fixture() -> Dict: + return{ + "graph": { + "nodes": [ + { + "data": { + "format": "parquet", + "files": [ + "./nodes/author.parquet" + ] + }, + "type": "author", + "column": "node_id", + }, + { + "data": { + "format": "parquet", + "files": [ + "./nodes/paper.parquet" + ] + }, + "type": "paper", + "column": "node_id", + "features": [ + { + "column": "feat", + "name": "feat", + "transformation": { + "name": "no-op" + } + } + ], + "labels": [ + { + "column": "label", + "type": "classification", + "split_rate": { + "train": 0.8, + "val": 0.1, + "test": 0.1 + } + } + ] + } + ], + "edges": [ + { + "data": { + "format": "parquet", + "files": [ + "./edges/author_writing_paper_hard_negative.parquet" + ] + }, + "source": { + "column": "source_id", + "type": "author" + }, + "dest": { + "column": "dest_id", + "type": "paper" + }, + "relation": { + "type": "writing" + } + }, + { + "data": { + "format": "parquet", + "files": [ + "./edges/paper_citing_paper.parquet" + ] + }, + "source": { + "column": "source_id", + "type": "paper" + }, + "dest": { + "column": "dest_id", + "type": "paper" + }, + "relation": { + "type": "citing" + }, + "labels": [ + { + "column": "", + "type": "link_prediction", + "split_rate": { + "train": 0.8, + "val": 0.1, + "test": 0.1 + } + } + ] + } + ] + }, + "version": "gsprocessing-v1.0" + } + + +def test_load_hard_negative_config(tmp_path, gsprocessing_hard_negative_config: Dict, + gsprocessing_non_hard_negative_config: Dict): + # For config with gsprocessing_config.json + json_file_path = f"{tmp_path}/gsprocessing_config.json" + + # Write the dictionary to the JSON file + with open(json_file_path, 'w') as json_file: + json.dump(gsprocessing_hard_negative_config, json_file, indent=4) + + res = load_hard_negative_config(json_file_path) + + assert res[0] == {'dst_node_type': 'paper', 'edge_type': + 'author:writing:paper', 'hard_neg_feat_name': 'hard_neg_feat'} + + # For config without hard negative feature definition + json_file_path = f"{tmp_path}/gsprocessing_config.json" + + # Write the dictionary to the JSON file + with open(json_file_path, 'w') as json_file: + json.dump(gsprocessing_non_hard_negative_config, + json_file, indent=4) + + res = load_hard_negative_config(json_file_path) + + assert res == [] + + +def test_shuffle_hard_negative_nids(tmp_path, gsprocessing_hard_negative_config: Dict): + # For config with gsprocessing_config.json + json_file_path = f"{tmp_path}/gsprocessing_config.json" + + # Write the dictionary to the JSON file + with open(json_file_path, 'w') as json_file: + json.dump(gsprocessing_hard_negative_config, json_file, indent=4) + + # Generate dgl graph + partitioned_graph = f"{tmp_path}/partitioned_graph" + + # Generate ID mapping for each partition + nid_map_dict_path0 = os.path.join(partitioned_graph, "dist_graph", "part0", "orig_nids.dgl") + nid_map_dict_path1 = os.path.join(partitioned_graph, "dist_graph", "part1", "orig_nids.dgl") + os.makedirs(os.path.dirname(nid_map_dict_path0), exist_ok=True) + os.makedirs(os.path.dirname(nid_map_dict_path1), exist_ok=True) + + # Use randperm in the test otherwise there maybe no mapping necessary + nid_map0 = { + "paper": th.randperm(100), + "author": th.arange(200, 300) + } + save_tensors(nid_map_dict_path0, nid_map0) + + nid_map1 = { + "paper": th.randperm(100) + 100, + "author": th.arange(300, 400) + } + save_tensors(nid_map_dict_path1, nid_map1) + + # Create reversed map + node_mapping = load_dist_nid_map(f"{partitioned_graph}/dist_graph", ["author", "paper"]) + reverse_map_dst = {gid: i for i, gid in enumerate(node_mapping["paper"].tolist())} + reverse_map_dst[-1] = -1 + + # generate edge features + etype = ("author", "writing", "paper") + edge_feat_path0 = os.path.join(partitioned_graph, "dist_graph", "part0", "edge_feat.dgl") + edge_feat_path1 = os.path.join(partitioned_graph, "dist_graph", "part1", "edge_feat.dgl") + os.makedirs(os.path.dirname(edge_feat_path0), exist_ok=True) + os.makedirs(os.path.dirname(edge_feat_path1), exist_ok=True) + + paper_writing_hard_neg0 = th.cat((th.randint(0, 100, (100, 100)), + th.full((100, 10), -1, dtype=th.int32)), dim=1) + paper_writing_hard_neg0_shuffled = [ + [reverse_map_dst[nid] for nid in negs] \ + for negs in paper_writing_hard_neg0.tolist()] + paper_writing_hard_neg0_shuffled = np.array(paper_writing_hard_neg0_shuffled) + paper_writing_hard_neg1 = th.cat((th.randint(100, 200, (100, 100)), + th.full((100, 10), -1, dtype=th.int32)), dim=1) + paper_writing_hard_neg1_shuffled = [ + [reverse_map_dst[nid] for nid in negs] \ + for negs in paper_writing_hard_neg1.tolist()] + paper_writing_hard_neg1_shuffled = np.array(paper_writing_hard_neg1_shuffled) + + save_tensors(edge_feat_path0, {":".join(etype)+"/hard_neg_feat": paper_writing_hard_neg0}) + save_tensors(edge_feat_path1, {":".join(etype)+"/hard_neg_feat": paper_writing_hard_neg1}) + + # Do the shuffling + shuffle_hard_negative_nids(json_file_path, 2, partitioned_graph) + + # Assert + paper_writing_hard_neg0 = load_tensors(edge_feat_path0) + assert_almost_equal(paper_writing_hard_neg0[":".join(etype) + "/hard_neg_feat"].numpy(), + paper_writing_hard_neg0_shuffled) + paper_writing_hard_neg1 = load_tensors(edge_feat_path1) + assert_almost_equal(paper_writing_hard_neg1[":".join(etype) + "/hard_neg_feat"].numpy(), + paper_writing_hard_neg1_shuffled) \ No newline at end of file From b7bcbaab6d65b91cd7596d44e48abc351af390a9 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Mon, 4 Nov 2024 23:44:03 +0000 Subject: [PATCH 12/50] add --- tests/unit-tests/gpartition/conftest.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/unit-tests/gpartition/conftest.py b/tests/unit-tests/gpartition/conftest.py index 9f2770e1c7..9d1cfa7dfd 100644 --- a/tests/unit-tests/gpartition/conftest.py +++ b/tests/unit-tests/gpartition/conftest.py @@ -21,8 +21,7 @@ import pytest from graphstorm.gpartition import LocalPartitionAlgorithm -from graphstorm.gpartition.post_hard_negative import (shuffle_hard_negative_nids, - load_hard_negative_config) + @pytest.fixture(scope="module", name="chunked_metadata_dict") def metadata_dict_fixture() -> Dict: From b67836df16e94ea933b98020464a20a61788fda9 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 5 Nov 2024 00:00:41 +0000 Subject: [PATCH 13/50] lint --- .../config/hard_negative_configs.py | 3 --- .../dist_feature_transformer.py | 11 +++++++--- .../dist_hard_negative_transformation.py | 21 ++++++++++++------- .../dist_heterogeneous_loader.py | 6 ++++-- 4 files changed, 26 insertions(+), 15 deletions(-) mode change 100644 => 100755 graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py diff --git a/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py index 00a99c8f1b..cc732fe80c 100644 --- a/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py +++ b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py @@ -33,6 +33,3 @@ def __init__(self, config: Mapping): self.separator = self._transformation_kwargs.get("separator", None) self._sanity_check() - - def _sanity_check(self) -> None: - super()._sanity_check() \ No newline at end of file diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py index a9c0a14ae5..0b540d32fd 100644 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py @@ -39,7 +39,11 @@ class DistFeatureTransformer(object): """ def __init__( - self, feature_config: FeatureConfig, spark: SparkSession, json_representation: dict, edge_mapping_dict: dict = None + self, + feature_config: FeatureConfig, + spark: SparkSession, + json_representation: dict, + edge_mapping_dict: dict = None, ): feat_type = feature_config.feat_type feat_name = feature_config.feat_name @@ -73,8 +77,9 @@ def __init__( elif feat_type == "huggingface": self.transformation = DistHFTransformation(**default_kwargs, **args_dict) elif feat_type == "edge_dst_hard_negative": - self.transformation = DistHardNegativeTransformation(**default_kwargs, **args_dict, - spark=spark, edge_mapping_dict=edge_mapping_dict) + self.transformation = DistHardNegativeTransformation( + **default_kwargs, **args_dict, spark=spark, edge_mapping_dict=edge_mapping_dict + ) else: raise NotImplementedError( f"Feature {feat_name} has type: {feat_type} that is not supported" diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py old mode 100644 new mode 100755 index 5a7a76e38d..046e5adfe1 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -14,19 +14,22 @@ limitations under the License. """ -import logging -import os from typing import Sequence -from pyspark.sql.functions import udf, split, col +from pyspark.sql.functions import split, col from pyspark.sql.types import ArrayType, IntegerType, StringType from pyspark.sql import DataFrame, functions as F, SparkSession +from graphstorm_processing.constants import NODE_MAPPING_STR, NODE_MAPPING_INT + from .base_dist_transformation import DistributedTransformation -from graphstorm_processing.constants import NODE_MAPPING_STR, NODE_MAPPING_INT def apply_transform( - cols: Sequence[str], separator: str, spark: SparkSession, input_df: DataFrame, edge_mapping_dict: dict + cols: Sequence[str], + separator: str, + spark: SparkSession, + input_df: DataFrame, + edge_mapping_dict: dict, ) -> DataFrame: """Applies hard negative transformation to each row. @@ -51,11 +54,15 @@ def apply_transform( _, _, dst_type = edge_mapping_dict["edge_type"].split(":") mapping_prefix = edge_mapping_dict["mapping_path"] format_name = edge_mapping_dict["format_name"] - hard_negative_node_mapping = spark.read.parquet(f"{mapping_prefix}{dst_type}/{format_name}/*.parquet") + hard_negative_node_mapping = spark.read.parquet( + f"{mapping_prefix}{dst_type}/{format_name}/*.parquet" + ) node_mapping_length = hard_negative_node_mapping.count() # TODO: This method may suffer from scalability issue, we can make this method to join-based solution. - hard_negative_node_mapping_dict = {row[NODE_MAPPING_STR]: row[NODE_MAPPING_INT] for row in hard_negative_node_mapping.collect()} + hard_negative_node_mapping_dict = { + row[NODE_MAPPING_STR]: row[NODE_MAPPING_INT] for row in hard_negative_node_mapping.collect() + } # Same length for feature to convert to tensor def map_values(hard_neg_list): diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py index ef3d5c00c2..476688776d 100644 --- a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py +++ b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py @@ -1658,9 +1658,11 @@ def _process_edge_features( edge_mapping_dict = { "edge_type": edge_type, "mapping_path": f"{self.output_prefix}/raw_id_mappings/", - "format_name": FORMAT_NAME + "format_name": FORMAT_NAME, } - transformer = DistFeatureTransformer(feat_conf, self.spark, json_representation, edge_mapping_dict) + transformer = DistFeatureTransformer( + feat_conf, self.spark, json_representation, edge_mapping_dict + ) if json_representation: logging.info( From 10d29fb10d9955beece698ceafda4a58e604d853 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 5 Nov 2024 00:08:57 +0000 Subject: [PATCH 14/50] lint --- python/graphstorm/gpartition/dist_partition_graph.py | 5 +++-- python/graphstorm/gpartition/post_hard_negative.py | 3 +-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/graphstorm/gpartition/dist_partition_graph.py b/python/graphstorm/gpartition/dist_partition_graph.py index 7f7bc46c11..6a4e938b16 100644 --- a/python/graphstorm/gpartition/dist_partition_graph.py +++ b/python/graphstorm/gpartition/dist_partition_graph.py @@ -193,7 +193,8 @@ def main(): # Hard Negative Mapping if args.gsprocessing_config: gsprocessing_config = args.gsprocessing_config - shuffle_hard_negative_nids(f"{args.input_path}/{gsprocessing_config}", args.output_path) + shuffle_hard_negative_nids(f"{args.input_path}/{gsprocessing_config}", + args.num_parts, args.output_path) else: for filename in os.listdir(args.input_path): if filename.endswith("_with_transformations.json"): @@ -245,4 +246,4 @@ def parse_args() -> argparse.Namespace: if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/python/graphstorm/gpartition/post_hard_negative.py b/python/graphstorm/gpartition/post_hard_negative.py index 2c98bbbb95..be77cead5e 100644 --- a/python/graphstorm/gpartition/post_hard_negative.py +++ b/python/graphstorm/gpartition/post_hard_negative.py @@ -30,7 +30,7 @@ def load_hard_negative_config(gsprocessing_config): gsprocessing_config: str Path to the gsprocessing config. """ - with open(gsprocessing_config, 'r') as file: + with open(gsprocessing_config, 'r', encoding='utf-8') as file: config = json.load(file) # Hard Negative only supports link prediction @@ -107,4 +107,3 @@ def get_gnid2pnid_map(ntype): # replace the edge_feat.dgl with the updated one. os.remove(edge_feat_path) save_tensors(edge_feat_path, edge_feats) - From 3fea39b9c0915aa53b4d97a69b00bd1678051c84 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 5 Nov 2024 00:12:38 +0000 Subject: [PATCH 15/50] change --- .../dist_transformations/dist_hard_negative_transformation.py | 3 ++- python/graphstorm/gpartition/__init__.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index 046e5adfe1..b86505b2eb 100755 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -59,7 +59,8 @@ def apply_transform( ) node_mapping_length = hard_negative_node_mapping.count() - # TODO: This method may suffer from scalability issue, we can make this method to join-based solution. + # TODO: This method may suffer from scalability issue, + # we can make this method to join-based solution. hard_negative_node_mapping_dict = { row[NODE_MAPPING_STR]: row[NODE_MAPPING_INT] for row in hard_negative_node_mapping.collect() } diff --git a/python/graphstorm/gpartition/__init__.py b/python/graphstorm/gpartition/__init__.py index 818d6ca79a..b66664f68f 100644 --- a/python/graphstorm/gpartition/__init__.py +++ b/python/graphstorm/gpartition/__init__.py @@ -19,4 +19,4 @@ from .metis_partition import (ParMetisPartitionAlgorithm) from .partition_config import (ParMETISConfig) from .partition_algo_base import LocalPartitionAlgorithm -from .post_hard_negative import shuffle_hard_negative_nids \ No newline at end of file +from .post_hard_negative import shuffle_hard_negative_nids From 4ad0e97374f214d4e7f134f87d474941a1aff08f Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 5 Nov 2024 00:22:18 +0000 Subject: [PATCH 16/50] black lint --- graphstorm-processing/graphstorm_processing/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/graphstorm-processing/graphstorm_processing/constants.py b/graphstorm-processing/graphstorm_processing/constants.py index fc27b17686..06aa3b3d36 100644 --- a/graphstorm-processing/graphstorm_processing/constants.py +++ b/graphstorm-processing/graphstorm_processing/constants.py @@ -62,6 +62,7 @@ NODE_MAPPING_STR = "orig" NODE_MAPPING_INT = "new" + ################# Supported execution envs ############## class ExecutionEnv(Enum): """Supported execution environments""" From a4dfb617c008bd8345e58c690880a07e578e890c Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 5 Nov 2024 18:18:54 +0000 Subject: [PATCH 17/50] add doc to hard negative --- docs/source/advanced/link-prediction.rst | 6 ++++-- .../distributed/gsprocessing/input-configuration.rst | 7 +++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/source/advanced/link-prediction.rst b/docs/source/advanced/link-prediction.rst index 10d7450cb6..9bf68c34ad 100644 --- a/docs/source/advanced/link-prediction.rst +++ b/docs/source/advanced/link-prediction.rst @@ -236,6 +236,8 @@ impact is negligible. With DGL 1.0.4, ``fast_localuniform`` dataloader can speedup 2.4X over ``localuniform`` dataloader on training a 2 layer RGCN on MAG dataset on four g5.48x instances. +.. _hard_negative_sampling: + Hard Negative sampling ----------------------- GraphStorm provides support for users to define hard negative edges for a positive edge during Link Prediction training. @@ -271,10 +273,10 @@ In general, GraphStorm covers following cases: **Preparing graph data for hard negative sampling** -The gconstruct pipeline of GraphStorm provides support to load hard negative data from raw input. +Now both single machine and distributed graph construction pipeline of GraphStorm provide support to load hard negative data from raw input. Hard destination negatives can be defined through ``edge_dst_hard_negative`` transformation. The ``feature_col`` field of ``edge_dst_hard_negative`` must stores the raw node ids of hard destination nodes. -The follwing example shows how to define a hard negative feature for edges with the relation ``(node1, relation1, node1)``: +The following example shows how to define a hard negative feature for edges with the relation ``(node1, relation1, node1)``: .. code-block:: json diff --git a/docs/source/cli/graph-construction/distributed/gsprocessing/input-configuration.rst b/docs/source/cli/graph-construction/distributed/gsprocessing/input-configuration.rst index f44b22d47e..ca5e85fdf5 100644 --- a/docs/source/cli/graph-construction/distributed/gsprocessing/input-configuration.rst +++ b/docs/source/cli/graph-construction/distributed/gsprocessing/input-configuration.rst @@ -491,6 +491,13 @@ arguments. You can use a length greater than the dataset's longest sentence; or for a safe value choose 128. Make sure to check the model's max supported length when setting this value. +- ``edge_dst_hard_negative`` + + - Encodes a hard negative edge feature for link prediction. For detail information for hard negative support, please refer to :ref:`hard_negative_sampling`. + - ``kwargs``: + - ``separator`` (String, optional): Same as the one in the No-op operation, the separator is used to + split multiple input values for CSV files e.g. ``p0;s1``. If it is not provided, then the whole value + will be considered as a string. .. _gsprocessing-multitask-ref: From 8dfdf050dbfe0e80bdcc5396ae9a53edc9e74262 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 5 Nov 2024 18:29:05 +0000 Subject: [PATCH 18/50] add doc --- .../dist_transformations/dist_hard_negative_transformation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index b86505b2eb..ad7e26987c 100755 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -39,6 +39,8 @@ def apply_transform( List of column names to apply normalization to. separator: str, optional The separator for string input value. Only required when input value type is string. + spark: SparkSession + The spark session input_df : DataFrame The input DataFrame to apply normalization to. edge_mapping_dict: dict @@ -49,7 +51,7 @@ def apply_transform( transformed_df = input_df.withColumn(cols[0], split(col(cols[0]), separator)) else: transformed_df = input_df - # Edge type should be (src_ntype:get_relation_name()}:dst_ntype) + # Edge type should be (src_ntype:get_relation_name():dst_ntype) # Assume all the node type in the hard negative feature should be dst node type _, _, dst_type = edge_mapping_dict["edge_type"].split(":") mapping_prefix = edge_mapping_dict["mapping_path"] From 52bef41a9205abc3cb3f8b550a4c004c72839c41 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 5 Nov 2024 18:31:18 +0000 Subject: [PATCH 19/50] reset test --- tests/unit-tests/gpartition/conftest.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/unit-tests/gpartition/conftest.py b/tests/unit-tests/gpartition/conftest.py index 9d1cfa7dfd..1e918698d7 100644 --- a/tests/unit-tests/gpartition/conftest.py +++ b/tests/unit-tests/gpartition/conftest.py @@ -22,7 +22,6 @@ from graphstorm.gpartition import LocalPartitionAlgorithm - @pytest.fixture(scope="module", name="chunked_metadata_dict") def metadata_dict_fixture() -> Dict: return { @@ -30,7 +29,6 @@ def metadata_dict_fixture() -> Dict: "node_type": ["a", "b"], } - def simple_test_partition( partition_algorithm: LocalPartitionAlgorithm, algorithm_name: str, @@ -63,7 +61,7 @@ def simple_test_partition( with open(os.path.join(tmpdir, "partition_meta.json"), 'r', encoding="utf-8") as f: part_meta = json.load(f) assert part_meta["num_parts"] == num_parts - assert part_meta["algo_name"] == algorithm_name + assert part_meta["algo_name"] == algorithm_name # Ensure contents of partition assignment files are correct for i, node_type in enumerate(chunked_metadata_dict["node_type"]): From 4f30c7dac036bee2a8370351db0738c27776c819 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 5 Nov 2024 18:32:03 +0000 Subject: [PATCH 20/50] add test --- tests/unit-tests/gpartition/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit-tests/gpartition/conftest.py b/tests/unit-tests/gpartition/conftest.py index 1e918698d7..6e522e3a5e 100644 --- a/tests/unit-tests/gpartition/conftest.py +++ b/tests/unit-tests/gpartition/conftest.py @@ -70,4 +70,4 @@ def simple_test_partition( assert len(node_partitions) == chunked_metadata_dict["num_nodes_per_type"][i] for part_id in node_partitions: assert part_id.isdigit() - assert int(part_id) < num_parts \ No newline at end of file + assert int(part_id) < num_parts From 308f833b388f6ed3768ee0230aa198d871590295 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 5 Nov 2024 18:33:29 +0000 Subject: [PATCH 21/50] simplify test --- .../test_hard_negative_post_partition.py | 70 ++----------------- 1 file changed, 4 insertions(+), 66 deletions(-) diff --git a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py index 0c3b32bf05..1d65ce7d86 100644 --- a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py +++ b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py @@ -50,27 +50,7 @@ def gsprocessing_config_hard_negative_dict_fixture() -> Dict: ] }, "type": "paper", - "column": "node_id", - "features": [ - { - "column": "feat", - "name": "feat", - "transformation": { - "name": "no-op" - } - } - ], - "labels": [ - { - "column": "label", - "type": "classification", - "split_rate": { - "train": 0.8, - "val": 0.1, - "test": 0.1 - } - } - ] + "column": "node_id" } ], "edges": [ @@ -122,18 +102,7 @@ def gsprocessing_config_hard_negative_dict_fixture() -> Dict: }, "relation": { "type": "citing" - }, - "labels": [ - { - "column": "", - "type": "link_prediction", - "split_rate": { - "train": 0.8, - "val": 0.1, - "test": 0.1 - } - } - ] + } } ] }, @@ -164,27 +133,7 @@ def gsprocessing_config_non_hard_negative_dict_fixture() -> Dict: ] }, "type": "paper", - "column": "node_id", - "features": [ - { - "column": "feat", - "name": "feat", - "transformation": { - "name": "no-op" - } - } - ], - "labels": [ - { - "column": "label", - "type": "classification", - "split_rate": { - "train": 0.8, - "val": 0.1, - "test": 0.1 - } - } - ] + "column": "node_id" } ], "edges": [ @@ -224,18 +173,7 @@ def gsprocessing_config_non_hard_negative_dict_fixture() -> Dict: }, "relation": { "type": "citing" - }, - "labels": [ - { - "column": "", - "type": "link_prediction", - "split_rate": { - "train": 0.8, - "val": 0.1, - "test": 0.1 - } - } - ] + } } ] }, From b6876d9e200c75aef55145ba5894c902a114f68e Mon Sep 17 00:00:00 2001 From: jalencato Date: Tue, 5 Nov 2024 13:22:22 -0800 Subject: [PATCH 22/50] Update gconstruct_converter.py --- .../config/config_conversion/gconstruct_converter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py index ce615e7a6a..33fe40f760 100644 --- a/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py +++ b/graphstorm-processing/graphstorm_processing/config/config_conversion/gconstruct_converter.py @@ -189,7 +189,6 @@ def _convert_feature(feats: list[Mapping[str, Any]]) -> list[dict]: "max_seq_length": gconstruct_transform_dict["max_seq_length"], } elif gconstruct_transform_dict["name"] == "edge_dst_hard_negative": - # Not check if it is link prediction task here gsp_transformation_dict["name"] = "edge_dst_hard_negative" if "separator" in gconstruct_transform_dict: gsp_transformation_dict["kwargs"] = { From bf1b0eb967fce7d3fc119b946f46f931b3bd1061 Mon Sep 17 00:00:00 2001 From: jalencato Date: Tue, 5 Nov 2024 13:22:54 -0800 Subject: [PATCH 23/50] Update hard_negative_configs.py --- .../graphstorm_processing/config/hard_negative_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py index cc732fe80c..42d63f9d40 100644 --- a/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py +++ b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py @@ -20,7 +20,7 @@ class HardNegativeConfig(FeatureConfig): - """Feature configuration for hard negative feature. Now only support link prediction + """Feature configuration for hard negative feature. Now only support link prediction. Supported kwargs ---------------- From 6d8ed9644f5a1b3540ec59a4506dcd2436cc0b31 Mon Sep 17 00:00:00 2001 From: jalencato Date: Tue, 5 Nov 2024 13:24:46 -0800 Subject: [PATCH 24/50] Update dist_feature_transformer.py --- .../data_transformations/dist_feature_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py index 0b540d32fd..5e5c936c7d 100644 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py @@ -51,7 +51,7 @@ def __init__( self.transformation: DistributedTransformation # We use this to re-apply transformations self.json_representation = json_representation - # Edge mapping file location + # Node Mapping Info for hard negative feature transformation self.edge_mapping_dict = edge_mapping_dict default_kwargs = { From 6eef513d494f04fd81cc0cdc12a953ad30054325 Mon Sep 17 00:00:00 2001 From: jalencato Date: Tue, 5 Nov 2024 13:25:41 -0800 Subject: [PATCH 25/50] Update dist_hard_negative_transformation.py --- .../dist_transformations/dist_hard_negative_transformation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index ad7e26987c..083bcb1924 100755 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -42,7 +42,7 @@ def apply_transform( spark: SparkSession The spark session input_df : DataFrame - The input DataFrame to apply normalization to. + The input DataFrame to apply transformation to. edge_mapping_dict: dict The mapping dictionary contain mapping file directory and edge type """ From 7079f2cd6513806f58be66c185114c8548ac3d78 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 5 Nov 2024 21:27:19 +0000 Subject: [PATCH 26/50] add feature transformation --- .../dist_hard_negative_transformation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index 083bcb1924..1bcb1f1465 100755 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -36,13 +36,13 @@ def apply_transform( Parameters ---------- cols : Sequence[str] - List of column names to apply normalization to. + List of column names to apply normalization to separator: str, optional - The separator for string input value. Only required when input value type is string. + The separator for string input value. Only required when input value type is string spark: SparkSession The spark session input_df : DataFrame - The input DataFrame to apply transformation to. + The input DataFrame to apply transformation to edge_mapping_dict: dict The mapping dictionary contain mapping file directory and edge type """ @@ -86,7 +86,7 @@ class DistHardNegativeTransformation(DistributedTransformation): Parameters ---------- separator: str, optional - The separator for string input value. Only required when input value type is string. + The separator for string input value. Only required when input value type is string spark: SparkSession The spark session edge_mapping_dict: dict From 8c3c79ef06e11c5b411fc51c258f94f9bba51b34 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Wed, 6 Nov 2024 00:10:10 +0000 Subject: [PATCH 27/50] add dot --- .../dist_hard_negative_transformation.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index 1bcb1f1465..94877f065a 100755 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -36,15 +36,15 @@ def apply_transform( Parameters ---------- cols : Sequence[str] - List of column names to apply normalization to + List of column names to apply normalization to. separator: str, optional - The separator for string input value. Only required when input value type is string + The separator for string input value. Only required when input value type is string. spark: SparkSession - The spark session + The spark session. input_df : DataFrame - The input DataFrame to apply transformation to + The input DataFrame to apply transformation to. edge_mapping_dict: dict - The mapping dictionary contain mapping file directory and edge type + The mapping dictionary contain mapping file directory and edge type. """ column_type = input_df.schema[cols[0]].dataType if isinstance(column_type, StringType): @@ -86,11 +86,11 @@ class DistHardNegativeTransformation(DistributedTransformation): Parameters ---------- separator: str, optional - The separator for string input value. Only required when input value type is string + The separator for string input value. Only required when input value type is string. spark: SparkSession - The spark session + The spark session. edge_mapping_dict: dict - The node type and mapping directory + The node type and mapping directory. """ def __init__( From a9dd308dcb622f573a002764fb953b2bf497345c Mon Sep 17 00:00:00 2001 From: JalenCato Date: Wed, 6 Nov 2024 00:20:47 +0000 Subject: [PATCH 28/50] hard negative config renaming --- .../graphstorm_processing/config/config_parser.py | 4 ++-- .../graphstorm_processing/config/hard_negative_configs.py | 2 +- .../data_transformations/dist_feature_transformer.py | 4 ++-- .../data_transformations/dist_transformations/__init__.py | 2 +- .../dist_hard_negative_transformation.py | 4 ++-- .../tests/test_dist_hard_negative_transformation.py | 6 +++--- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/config/config_parser.py b/graphstorm-processing/graphstorm_processing/config/config_parser.py index 15f323ced8..95f4ab3dd2 100644 --- a/graphstorm-processing/graphstorm_processing/config/config_parser.py +++ b/graphstorm-processing/graphstorm_processing/config/config_parser.py @@ -29,7 +29,7 @@ ) from .categorical_configs import MultiCategoricalFeatureConfig from .hf_configs import HFConfig -from .hard_negative_configs import HardNegativeConfig +from .hard_negative_configs import HardEdgeNegativeConfig from .data_config_base import DataStorageConfig @@ -73,7 +73,7 @@ def parse_feat_config(feature_dict: Dict) -> FeatureConfig: elif transformation_name == "huggingface": return HFConfig(feature_dict) elif transformation_name == "edge_dst_hard_negative": - return HardNegativeConfig(feature_dict) + return HardEdgeNegativeConfig(feature_dict) else: raise RuntimeError(f"Unknown transformation name: '{transformation_name}'") diff --git a/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py index 42d63f9d40..26e99a00a6 100644 --- a/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py +++ b/graphstorm-processing/graphstorm_processing/config/hard_negative_configs.py @@ -19,7 +19,7 @@ from .feature_config_base import FeatureConfig -class HardNegativeConfig(FeatureConfig): +class HardEdgeNegativeConfig(FeatureConfig): """Feature configuration for hard negative feature. Now only support link prediction. Supported kwargs diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py index 5e5c936c7d..f30a17b383 100644 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py @@ -28,7 +28,7 @@ DistCategoryTransformation, DistMultiCategoryTransformation, DistHFTransformation, - DistHardNegativeTransformation, + DistHardEdgeNegativeTransformation, ) @@ -77,7 +77,7 @@ def __init__( elif feat_type == "huggingface": self.transformation = DistHFTransformation(**default_kwargs, **args_dict) elif feat_type == "edge_dst_hard_negative": - self.transformation = DistHardNegativeTransformation( + self.transformation = DistHardEdgeNegativeTransformation( **default_kwargs, **args_dict, spark=spark, edge_mapping_dict=edge_mapping_dict ) else: diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/__init__.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/__init__.py index 5c74d4928a..959124644b 100644 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/__init__.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/__init__.py @@ -15,4 +15,4 @@ ) from .dist_bucket_numerical_transformation import DistBucketNumericalTransformation from .dist_hf_transformation import DistHFTransformation -from .dist_hard_negative_transformation import DistHardNegativeTransformation +from .dist_hard_negative_transformation import DistHardEdgeNegativeTransformation diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index 94877f065a..82a0816bde 100755 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -80,7 +80,7 @@ def map_values(hard_neg_list): return transformed_df -class DistHardNegativeTransformation(DistributedTransformation): +class DistHardEdgeNegativeTransformation(DistributedTransformation): """Transformation to apply hard negative transformation. Parameters @@ -113,4 +113,4 @@ def apply(self, input_df: DataFrame) -> DataFrame: @staticmethod def get_transformation_name() -> str: - return "DistHardNegativeTransformation" + return "DistHardEdgeNegativeTransformation" diff --git a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py index cbc6b83687..0814384164 100755 --- a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py +++ b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py @@ -21,7 +21,7 @@ from graphstorm_processing.constants import NODE_MAPPING_STR, NODE_MAPPING_INT from graphstorm_processing.data_transformations.dist_transformations import ( - DistHardNegativeTransformation, + DistHardEdgeNegativeTransformation, ) @@ -51,7 +51,7 @@ def test_hard_negative_example_list(spark: SparkSession, check_df_schema, tmp_pa "mapping_path": f"{tmp_path}/raw_id_mappings/", "format_name": "parquet", } - hard_negative_transformation = DistHardNegativeTransformation( + hard_negative_transformation = DistHardEdgeNegativeTransformation( ["hard_negative"], spark=spark, edge_mapping_dict=edge_mapping_dict, separator=None ) output_df = hard_negative_transformation.apply(input_df) @@ -92,7 +92,7 @@ def test_hard_negative_example_str(spark: SparkSession, check_df_schema, tmp_pat "mapping_path": f"{tmp_path}/raw_id_mappings/", "format_name": "parquet", } - hard_negative_transformation = DistHardNegativeTransformation( + hard_negative_transformation = DistHardEdgeNegativeTransformation( ["hard_negative"], spark=spark, edge_mapping_dict=edge_mapping_dict, separator=";" ) output_df = hard_negative_transformation.apply(input_df) From cd319fa7e56cfd5a1928a601c935dfe84efe95c2 Mon Sep 17 00:00:00 2001 From: jalencato Date: Tue, 5 Nov 2024 16:47:48 -0800 Subject: [PATCH 29/50] Update constants.py --- graphstorm-processing/graphstorm_processing/constants.py | 1 - 1 file changed, 1 deletion(-) diff --git a/graphstorm-processing/graphstorm_processing/constants.py b/graphstorm-processing/graphstorm_processing/constants.py index 06aa3b3d36..fc27b17686 100644 --- a/graphstorm-processing/graphstorm_processing/constants.py +++ b/graphstorm-processing/graphstorm_processing/constants.py @@ -62,7 +62,6 @@ NODE_MAPPING_STR = "orig" NODE_MAPPING_INT = "new" - ################# Supported execution envs ############## class ExecutionEnv(Enum): """Supported execution environments""" From 4f25cb9a72a8129f9c173372cf070dcb909cb57b Mon Sep 17 00:00:00 2001 From: JalenCato Date: Wed, 6 Nov 2024 01:00:36 +0000 Subject: [PATCH 30/50] add constant --- graphstorm-processing/graphstorm_processing/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/graphstorm-processing/graphstorm_processing/constants.py b/graphstorm-processing/graphstorm_processing/constants.py index fc27b17686..06aa3b3d36 100644 --- a/graphstorm-processing/graphstorm_processing/constants.py +++ b/graphstorm-processing/graphstorm_processing/constants.py @@ -62,6 +62,7 @@ NODE_MAPPING_STR = "orig" NODE_MAPPING_INT = "new" + ################# Supported execution envs ############## class ExecutionEnv(Enum): """Supported execution environments""" From ff7c470d211ab109797d1423211bb7b172137d11 Mon Sep 17 00:00:00 2001 From: jalencato Date: Mon, 11 Nov 2024 09:51:32 -0800 Subject: [PATCH 31/50] Apply suggestions from code review Co-authored-by: xiang song(charlie.song) --- docs/source/advanced/link-prediction.rst | 2 +- .../distributed/gsprocessing/input-configuration.rst | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/advanced/link-prediction.rst b/docs/source/advanced/link-prediction.rst index 3a68684e7e..f28f0ac542 100644 --- a/docs/source/advanced/link-prediction.rst +++ b/docs/source/advanced/link-prediction.rst @@ -274,7 +274,7 @@ In general, GraphStorm covers following cases: Preparing graph data for hard negative sampling ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Now both single machine and distributed graph construction pipeline of GraphStorm provide support to load hard negative data from raw input. +Both single machine and distributed graph construction pipeline of GraphStorm provide support to load hard negative data from raw input. Hard destination negatives can be defined through ``edge_dst_hard_negative`` transformation. The ``feature_col`` field of ``edge_dst_hard_negative`` must stores the raw node ids of hard destination nodes. The following example shows how to define a hard negative feature for edges with the relation ``(node1, relation1, node1)``: diff --git a/docs/source/cli/graph-construction/distributed/gsprocessing/input-configuration.rst b/docs/source/cli/graph-construction/distributed/gsprocessing/input-configuration.rst index ca5e85fdf5..d2074a338f 100644 --- a/docs/source/cli/graph-construction/distributed/gsprocessing/input-configuration.rst +++ b/docs/source/cli/graph-construction/distributed/gsprocessing/input-configuration.rst @@ -495,9 +495,9 @@ arguments. - Encodes a hard negative edge feature for link prediction. For detail information for hard negative support, please refer to :ref:`hard_negative_sampling`. - ``kwargs``: - - ``separator`` (String, optional): Same as the one in the No-op operation, the separator is used to - split multiple input values for CSV files e.g. ``p0;s1``. If it is not provided, then the whole value - will be considered as a string. + - ``separator`` (String, optional): The separator is used to + split multiple values in an input string for data in CSV files e.g. ``p0;s1``. If it is not provided, then the whole value + will be treated as a single string. .. _gsprocessing-multitask-ref: From dbb8c0fe63b8705f911a9e73d695333babe20e38 Mon Sep 17 00:00:00 2001 From: jalencato Date: Mon, 11 Nov 2024 09:55:14 -0800 Subject: [PATCH 32/50] Apply suggestions from code review Co-authored-by: xiang song(charlie.song) --- .../dist_hard_negative_transformation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index 82a0816bde..3bc2ec1868 100755 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -51,8 +51,9 @@ def apply_transform( transformed_df = input_df.withColumn(cols[0], split(col(cols[0]), separator)) else: transformed_df = input_df - # Edge type should be (src_ntype:get_relation_name():dst_ntype) - # Assume all the node type in the hard negative feature should be dst node type + # Edge type should be (src_ntype:relation_type:dst_ntype) + # Only support hard negative for destination nodes. Get the node type of destination nodes. + # TODO: support hard negative for source nodes. _, _, dst_type = edge_mapping_dict["edge_type"].split(":") mapping_prefix = edge_mapping_dict["mapping_path"] format_name = edge_mapping_dict["format_name"] From ad1c7864a3af36476236cf0e3d36babf37a09cc8 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Mon, 11 Nov 2024 19:48:29 +0000 Subject: [PATCH 33/50] apply comment for GSProcessing --- .../dist_feature_transformer.py | 2 +- .../dist_hard_negative_transformation.py | 24 +++++++++++-------- .../dist_heterogeneous_loader.py | 17 ++++++------- graphstorm-processing/tests/test_converter.py | 4 ++-- .../test_dist_hard_negative_transformation.py | 15 ++++++++---- 5 files changed, 36 insertions(+), 26 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py index f30a17b383..ec11de7ff5 100644 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py @@ -78,7 +78,7 @@ def __init__( self.transformation = DistHFTransformation(**default_kwargs, **args_dict) elif feat_type == "edge_dst_hard_negative": self.transformation = DistHardEdgeNegativeTransformation( - **default_kwargs, **args_dict, spark=spark, edge_mapping_dict=edge_mapping_dict + **default_kwargs, **args_dict, spark=spark ) else: raise NotImplementedError( diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index 3bc2ec1868..6ba99f9cd6 100755 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -29,7 +29,7 @@ def apply_transform( separator: str, spark: SparkSession, input_df: DataFrame, - edge_mapping_dict: dict, + hard_node_mapping_dict: dict, ) -> DataFrame: """Applies hard negative transformation to each row. @@ -43,7 +43,7 @@ def apply_transform( The spark session. input_df : DataFrame The input DataFrame to apply transformation to. - edge_mapping_dict: dict + hard_node_mapping_dict: dict The mapping dictionary contain mapping file directory and edge type. """ column_type = input_df.schema[cols[0]].dataType @@ -54,9 +54,9 @@ def apply_transform( # Edge type should be (src_ntype:relation_type:dst_ntype) # Only support hard negative for destination nodes. Get the node type of destination nodes. # TODO: support hard negative for source nodes. - _, _, dst_type = edge_mapping_dict["edge_type"].split(":") - mapping_prefix = edge_mapping_dict["mapping_path"] - format_name = edge_mapping_dict["format_name"] + _, _, dst_type = hard_node_mapping_dict["edge_type"].split(":") + mapping_prefix = hard_node_mapping_dict["mapping_path"] + format_name = hard_node_mapping_dict["format_name"] hard_negative_node_mapping = spark.read.parquet( f"{mapping_prefix}{dst_type}/{format_name}/*.parquet" ) @@ -90,24 +90,28 @@ class DistHardEdgeNegativeTransformation(DistributedTransformation): The separator for string input value. Only required when input value type is string. spark: SparkSession The spark session. - edge_mapping_dict: dict + hard_node_mapping_dict: dict The node type and mapping directory. """ def __init__( - self, cols: Sequence[str], spark: SparkSession, separator: str = "", edge_mapping_dict=None + self, + cols: Sequence[str], + spark: SparkSession, + separator: str = "", + hard_node_mapping_dict=None, ) -> None: super().__init__(cols, spark) self.cols = cols assert len(self.cols) == 1, "Hard Negative Transformation only supports single column" self.separator = separator - self.edge_mapping_dict = edge_mapping_dict - assert self.edge_mapping_dict, "edge mapping dict cannot be None for hard negative " + self.hard_node_mapping_dict = hard_node_mapping_dict + assert self.hard_node_mapping_dict, "edge mapping dict cannot be None for hard negative " def apply(self, input_df: DataFrame) -> DataFrame: assert self.spark transformed_df = apply_transform( - self.cols, self.separator, self.spark, input_df, self.edge_mapping_dict + self.cols, self.separator, self.spark, input_df, self.hard_node_mapping_dict ) return transformed_df diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py index 476688776d..07ce95bf2a 100644 --- a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py +++ b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py @@ -1655,14 +1655,15 @@ def _process_edge_features( .get(feat_conf.feat_name, {}) ) # Hard Negative Transformation use case, but should be able to be reused - edge_mapping_dict = { - "edge_type": edge_type, - "mapping_path": f"{self.output_prefix}/raw_id_mappings/", - "format_name": FORMAT_NAME, - } - transformer = DistFeatureTransformer( - feat_conf, self.spark, json_representation, edge_mapping_dict - ) + if feat_conf.feat_type == "edge_dst_hard_negative": + hard_node_mapping_dict = { + "edge_type": edge_type, + "mapping_path": f"{self.output_prefix}/raw_id_mappings/", + "format_name": FORMAT_NAME, + } + feat_conf.transformation_kwargs["hard_node_mapping_dict"] = hard_node_mapping_dict + + transformer = DistFeatureTransformer(feat_conf, self.spark, json_representation) if json_representation: logging.info( diff --git a/graphstorm-processing/tests/test_converter.py b/graphstorm-processing/tests/test_converter.py index 30b04f3855..26b837c9a1 100644 --- a/graphstorm-processing/tests/test_converter.py +++ b/graphstorm-processing/tests/test_converter.py @@ -406,7 +406,7 @@ def test_convert_gsprocessing(converter: GConstructConfigConverter): { "feature_col": ["author"], "feature_name": "hard_negative", - "transform": {"name": "edge_dst_hard_negative"}, + "transform": {"name": "edge_dst_hard_negative", "separator": ";"}, }, ], "labels": [ @@ -516,7 +516,7 @@ def test_convert_gsprocessing(converter: GConstructConfigConverter): { "column": "author", "name": "hard_negative", - "transformation": {"name": "edge_dst_hard_negative"}, + "transformation": {"name": "edge_dst_hard_negative", "separator": ";"}, }, ] assert edges_output["labels"] == [ diff --git a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py index 0814384164..d2aa6ddece 100755 --- a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py +++ b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py @@ -46,22 +46,26 @@ def test_hard_negative_example_list(spark: SparkSession, check_df_schema, tmp_pa mapping_column = [NODE_MAPPING_STR, NODE_MAPPING_INT] mapping_df = spark.createDataFrame(mapping_data, schema=mapping_column) mapping_df.repartition(1).write.parquet(f"{tmp_path}/raw_id_mappings/dst_type/parquet") - edge_mapping_dict = { + hard_node_mapping_dict = { "edge_type": "src_type:relation:dst_type", "mapping_path": f"{tmp_path}/raw_id_mappings/", "format_name": "parquet", } hard_negative_transformation = DistHardEdgeNegativeTransformation( - ["hard_negative"], spark=spark, edge_mapping_dict=edge_mapping_dict, separator=None + ["hard_negative"], + spark=spark, + hard_node_mapping_dict=hard_node_mapping_dict, + separator=None, ) output_df = hard_negative_transformation.apply(input_df) check_df_schema(output_df) output_data = output_df.collect() + # Length should be 4 for each tensor because there are 4 distinct nodes for dst node expected_output = [[1, -1, -1, -1], [2, 3, -1, -1], [3, 0, 1, -1], [0, -1, -1, -1]] for idx, row in enumerate(output_data): - np.testing.assert_almost_equal( + np.testing.assert_equal( row[0], expected_output[idx], decimal=3, err_msg=f"Row {idx} is not equal" ) @@ -93,15 +97,16 @@ def test_hard_negative_example_str(spark: SparkSession, check_df_schema, tmp_pat "format_name": "parquet", } hard_negative_transformation = DistHardEdgeNegativeTransformation( - ["hard_negative"], spark=spark, edge_mapping_dict=edge_mapping_dict, separator=";" + ["hard_negative"], spark=spark, hard_node_mapping_dict=hard_node_mapping_dict, separator=";" ) output_df = hard_negative_transformation.apply(input_df) check_df_schema(output_df) output_data = output_df.collect() + # Length should be 4 for each tensor because there are 4 distinct nodes for dst node expected_output = [[1, -1, -1, -1], [2, 3, -1, -1], [3, 0, 1, -1], [0, -1, -1, -1]] for idx, row in enumerate(output_data): - np.testing.assert_almost_equal( + np.testing.assert_equal( row[0], expected_output[idx], decimal=3, err_msg=f"Row {idx} is not equal" ) From 2e47f2d2c8a574a3bf08c3cdcd143f4c23981354 Mon Sep 17 00:00:00 2001 From: jalencato Date: Mon, 11 Nov 2024 11:49:33 -0800 Subject: [PATCH 34/50] Apply suggestions from code review Co-authored-by: xiang song(charlie.song) --- python/graphstorm/gpartition/post_hard_negative.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/graphstorm/gpartition/post_hard_negative.py b/python/graphstorm/gpartition/post_hard_negative.py index be77cead5e..a739f20bf6 100644 --- a/python/graphstorm/gpartition/post_hard_negative.py +++ b/python/graphstorm/gpartition/post_hard_negative.py @@ -56,7 +56,7 @@ def load_hard_negative_config(gsprocessing_config): def shuffle_hard_negative_nids(gsprocessing_config, num_parts, output_path): """Shuffle hard negative edge feature ids with int-to-int node id mapping. The function here align with the shuffle_hard_nids in graphstorm.gconstruct.utils. - Create an additional function to handle the id mappings under distributed setting. + Create an additional function to handle the id mappings under the distributed setting. Parameters ---------------- From e631d2c90ae3b763dc55caae0d0ef3e8ea272227 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Mon, 11 Nov 2024 20:50:35 +0000 Subject: [PATCH 35/50] apply comments for gspartition --- graphstorm-processing/tests/test_converter.py | 2 +- .../test_dist_hard_negative_transformation.py | 6 +- .../gpartition/dist_partition_graph.py | 27 +-- .../gpartition/post_hard_negative.py | 14 +- python/graphstorm/model/utils.py | 14 +- .../gsprocessing_hard_negative_config.json | 79 ++++++++ ...gsprocessing_non_hard_negative_config.json | 67 +++++++ .../test_hard_negative_post_partition.py | 181 +----------------- 8 files changed, 186 insertions(+), 204 deletions(-) create mode 100644 tests/unit-tests/gpartition/config/gsprocessing_hard_negative_config.json create mode 100644 tests/unit-tests/gpartition/config/gsprocessing_non_hard_negative_config.json diff --git a/graphstorm-processing/tests/test_converter.py b/graphstorm-processing/tests/test_converter.py index 26b837c9a1..a4871342d2 100644 --- a/graphstorm-processing/tests/test_converter.py +++ b/graphstorm-processing/tests/test_converter.py @@ -516,7 +516,7 @@ def test_convert_gsprocessing(converter: GConstructConfigConverter): { "column": "author", "name": "hard_negative", - "transformation": {"name": "edge_dst_hard_negative", "separator": ";"}, + "transformation": {"name": "edge_dst_hard_negative", "kwargs": {"separator": ";"}}, }, ] assert edges_output["labels"] == [ diff --git a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py index d2aa6ddece..179a65dc3c 100755 --- a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py +++ b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py @@ -66,7 +66,7 @@ def test_hard_negative_example_list(spark: SparkSession, check_df_schema, tmp_pa for idx, row in enumerate(output_data): np.testing.assert_equal( - row[0], expected_output[idx], decimal=3, err_msg=f"Row {idx} is not equal" + row[0], expected_output[idx], err_msg=f"Row {idx} is not equal" ) @@ -91,7 +91,7 @@ def test_hard_negative_example_str(spark: SparkSession, check_df_schema, tmp_pat mapping_column = [NODE_MAPPING_STR, NODE_MAPPING_INT] mapping_df = spark.createDataFrame(mapping_data, schema=mapping_column) mapping_df.repartition(1).write.parquet(f"{tmp_path}/raw_id_mappings/dst_type/parquet") - edge_mapping_dict = { + hard_node_mapping_dict = { "edge_type": "src_type:relation:dst_type", "mapping_path": f"{tmp_path}/raw_id_mappings/", "format_name": "parquet", @@ -108,5 +108,5 @@ def test_hard_negative_example_str(spark: SparkSession, check_df_schema, tmp_pat for idx, row in enumerate(output_data): np.testing.assert_equal( - row[0], expected_output[idx], decimal=3, err_msg=f"Row {idx} is not equal" + row[0], expected_output[idx], err_msg=f"Row {idx} is not equal" ) diff --git a/python/graphstorm/gpartition/dist_partition_graph.py b/python/graphstorm/gpartition/dist_partition_graph.py index 6a4e938b16..58ed0a57a5 100644 --- a/python/graphstorm/gpartition/dist_partition_graph.py +++ b/python/graphstorm/gpartition/dist_partition_graph.py @@ -191,22 +191,15 @@ def main(): ) # Hard Negative Mapping - if args.gsprocessing_config: - gsprocessing_config = args.gsprocessing_config - shuffle_hard_negative_nids(f"{args.input_path}/{gsprocessing_config}", - args.num_parts, args.output_path) - else: - for filename in os.listdir(args.input_path): - if filename.endswith("_with_transformations.json"): - gsprocessing_config = filename - shuffle_hard_negative_nids(f"{args.input_path}/{gsprocessing_config}", - args.num_parts, args.output_path) - break - else: - # Did not raise error here for not introducing the break change, - # but will raise warning here to warn customers. - logging.info("Skip the hard negative node ID mapping, " - "please upgrade to the latest GSProcessing.") + # Load GSProcessing config from launch_arguments generated by GSProcessing + # Generated GSProcessing config will have _with_transformation suffix. + with open(os.path.join(args.input_path, "launch_arguments.json"), + "r", encoding="utf-8") as f: + gsprocessing_launch_arguments: Dict = json.load(f) + gsprocessing_config = gsprocessing_launch_arguments["config_filename"] + gsprocessing_config = gsprocessing_config.replace(".json", "_with_transformations.json") + shuffle_hard_negative_nids(f"{args.input_path}/{gsprocessing_config}", + args.num_parts, args.output_path) def parse_args() -> argparse.Namespace: @@ -215,8 +208,6 @@ def parse_args() -> argparse.Namespace: + "or regression tasks") argparser.add_argument("--input-path", type=str, required=True, help="Path to input DGL chunked data.") - argparser.add_argument("--gsprocessing-config", type=str, - help="Path to the input GSProcessing config data.") argparser.add_argument("--metadata-filename", type=str, default="metadata.json", help="Name for the chunked DGL data metadata file.") argparser.add_argument("--output-path", type=str, required=True, diff --git a/python/graphstorm/gpartition/post_hard_negative.py b/python/graphstorm/gpartition/post_hard_negative.py index a739f20bf6..45525894a0 100644 --- a/python/graphstorm/gpartition/post_hard_negative.py +++ b/python/graphstorm/gpartition/post_hard_negative.py @@ -35,7 +35,7 @@ def load_hard_negative_config(gsprocessing_config): # Hard Negative only supports link prediction edges_config = config['graph']['edges'] - mapping_edge_list = [] + hard_neg_list = [] for single_edge_config in edges_config: if "features" not in single_edge_config: continue @@ -47,13 +47,13 @@ def load_hard_negative_config(gsprocessing_config): single_edge_config["relation"]["type"], single_edge_config["dest"]["type"]]) hard_neg_feat_name = single_feature['name'] - mapping_edge_list.append({"dst_node_type": single_edge_config["dest"]["type"], + hard_neg_list.append({"dst_node_type": single_edge_config["dest"]["type"], "edge_type": edge_type, "hard_neg_feat_name": hard_neg_feat_name}) - return mapping_edge_list + return hard_neg_list -def shuffle_hard_negative_nids(gsprocessing_config, num_parts, output_path): +def shuffle_hard_negative_nids(gsprocessing_config, num_parts, graph_path): """Shuffle hard negative edge feature ids with int-to-int node id mapping. The function here align with the shuffle_hard_nids in graphstorm.gconstruct.utils. Create an additional function to handle the id mappings under the distributed setting. @@ -64,7 +64,7 @@ def shuffle_hard_negative_nids(gsprocessing_config, num_parts, output_path): Path to the gsprocessing config. num_parts: int Number of parts. - output_path: str + graph_path: str Path to the output DGL graph. """ shuffled_edge_config = load_hard_negative_config(gsprocessing_config) @@ -73,7 +73,7 @@ def shuffle_hard_negative_nids(gsprocessing_config, num_parts, output_path): for single_shuffled_edge_config in shuffled_edge_config: node_type = single_shuffled_edge_config["dst_node_type"] node_type_list.append(node_type) - node_mapping = load_dist_nid_map(f"{output_path}/dist_graph", node_type_list) + node_mapping = load_dist_nid_map(f"{graph_path}/dist_graph", node_type_list) gnid2pnid_mapping = {} def get_gnid2pnid_map(ntype): @@ -89,7 +89,7 @@ def get_gnid2pnid_map(ntype): # iterate all the partitions to convert hard negative node ids. for i in range(num_parts): - part_path = os.path.join(f"{output_path}/dist_graph", f"part{i}") + part_path = os.path.join(f"{graph_path}/dist_graph", f"part{i}") edge_feat_path = os.path.join(part_path, "edge_feat.dgl") # load edge features first diff --git a/python/graphstorm/model/utils.py b/python/graphstorm/model/utils.py index 3d12ec3458..410bc6eac3 100644 --- a/python/graphstorm/model/utils.py +++ b/python/graphstorm/model/utils.py @@ -394,7 +394,19 @@ def _exchange_node_id_mapping(rank, world_size, device, return gather_list[0].to(th.device("cpu")) def load_dist_nid_map(node_id_mapping_file, ntypes): - """ Wrapper for load_dist_nid_map. + """ Load id mapping files in dist partition format. + + Parameters + ---------- + node_id_mapping_file: str + Node mapping directory. + ntypes: list[str] + List of node types. + + Return + ------ + id_mappings: dict + Node mapping dictionary. """ return _load_dist_nid_map(node_id_mapping_file, ntypes) diff --git a/tests/unit-tests/gpartition/config/gsprocessing_hard_negative_config.json b/tests/unit-tests/gpartition/config/gsprocessing_hard_negative_config.json new file mode 100644 index 0000000000..0d56a2cf3a --- /dev/null +++ b/tests/unit-tests/gpartition/config/gsprocessing_hard_negative_config.json @@ -0,0 +1,79 @@ +{ + "graph": { + "nodes": [ + { + "data": { + "format": "parquet", + "files": [ + "./nodes/author.parquet" + ] + }, + "type": "author", + "column": "node_id" + }, + { + "data": { + "format": "parquet", + "files": [ + "./nodes/paper.parquet" + ] + }, + "type": "paper", + "column": "node_id" + } + ], + "edges": [ + { + "data": { + "format": "parquet", + "files": [ + "./edges/author_writing_paper_hard_negative.parquet" + ] + }, + "source": { + "column": "source_id", + "type": "author" + }, + "dest": { + "column": "dest_id", + "type": "paper" + }, + "relation": { + "type": "writing" + }, + "features": [ + { + "column": "hard_neg", + "name": "hard_neg_feat", + "transformation": { + "name": "edge_dst_hard_negative", + "kwargs": { + "separator": ";" + } + } + } + ] + }, + { + "data": { + "format": "parquet", + "files": [ + "./edges/paper_citing_paper.parquet" + ] + }, + "source": { + "column": "source_id", + "type": "paper" + }, + "dest": { + "column": "dest_id", + "type": "paper" + }, + "relation": { + "type": "citing" + } + } + ] + }, + "version": "gsprocessing-v1.0" +} \ No newline at end of file diff --git a/tests/unit-tests/gpartition/config/gsprocessing_non_hard_negative_config.json b/tests/unit-tests/gpartition/config/gsprocessing_non_hard_negative_config.json new file mode 100644 index 0000000000..daf5122113 --- /dev/null +++ b/tests/unit-tests/gpartition/config/gsprocessing_non_hard_negative_config.json @@ -0,0 +1,67 @@ +{ + "graph": { + "nodes": [ + { + "data": { + "format": "parquet", + "files": [ + "./nodes/author.parquet" + ] + }, + "type": "author", + "column": "node_id" + }, + { + "data": { + "format": "parquet", + "files": [ + "./nodes/paper.parquet" + ] + }, + "type": "paper", + "column": "node_id" + } + ], + "edges": [ + { + "data": { + "format": "parquet", + "files": [ + "./edges/author_writing_paper_hard_negative.parquet" + ] + }, + "source": { + "column": "source_id", + "type": "author" + }, + "dest": { + "column": "dest_id", + "type": "paper" + }, + "relation": { + "type": "writing" + } + }, + { + "data": { + "format": "parquet", + "files": [ + "./edges/paper_citing_paper.parquet" + ] + }, + "source": { + "column": "source_id", + "type": "paper" + }, + "dest": { + "column": "dest_id", + "type": "paper" + }, + "relation": { + "type": "citing" + } + } + ] + }, + "version": "gsprocessing-v1.0" +} \ No newline at end of file diff --git a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py index 1d65ce7d86..dc21115ead 100644 --- a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py +++ b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py @@ -27,194 +27,27 @@ from graphstorm.gpartition.post_hard_negative import (shuffle_hard_negative_nids, load_hard_negative_config) -@pytest.fixture(scope="module", name="gsprocessing_hard_negative_config") -def gsprocessing_config_hard_negative_dict_fixture() -> Dict: - return{ - "graph": { - "nodes": [ - { - "data": { - "format": "parquet", - "files": [ - "./nodes/author.parquet" - ] - }, - "type": "author", - "column": "node_id", - }, - { - "data": { - "format": "parquet", - "files": [ - "./nodes/paper.parquet" - ] - }, - "type": "paper", - "column": "node_id" - } - ], - "edges": [ - { - "data": { - "format": "parquet", - "files": [ - "./edges/author_writing_paper_hard_negative.parquet" - ] - }, - "source": { - "column": "source_id", - "type": "author" - }, - "dest": { - "column": "dest_id", - "type": "paper" - }, - "relation": { - "type": "writing" - }, - "features": [ - { - "column": "hard_neg", - "name": "hard_neg_feat", - "transformation": { - "name": "edge_dst_hard_negative", - "kwargs": { - "separator": ";" - } - } - } - ] - }, - { - "data": { - "format": "parquet", - "files": [ - "./edges/paper_citing_paper.parquet" - ] - }, - "source": { - "column": "source_id", - "type": "paper" - }, - "dest": { - "column": "dest_id", - "type": "paper" - }, - "relation": { - "type": "citing" - } - } - ] - }, - "version": "gsprocessing-v1.0" - } - -@pytest.fixture(scope="module", name="gsprocessing_non_hard_negative_config") -def gsprocessing_config_non_hard_negative_dict_fixture() -> Dict: - return{ - "graph": { - "nodes": [ - { - "data": { - "format": "parquet", - "files": [ - "./nodes/author.parquet" - ] - }, - "type": "author", - "column": "node_id", - }, - { - "data": { - "format": "parquet", - "files": [ - "./nodes/paper.parquet" - ] - }, - "type": "paper", - "column": "node_id" - } - ], - "edges": [ - { - "data": { - "format": "parquet", - "files": [ - "./edges/author_writing_paper_hard_negative.parquet" - ] - }, - "source": { - "column": "source_id", - "type": "author" - }, - "dest": { - "column": "dest_id", - "type": "paper" - }, - "relation": { - "type": "writing" - } - }, - { - "data": { - "format": "parquet", - "files": [ - "./edges/paper_citing_paper.parquet" - ] - }, - "source": { - "column": "source_id", - "type": "paper" - }, - "dest": { - "column": "dest_id", - "type": "paper" - }, - "relation": { - "type": "citing" - } - } - ] - }, - "version": "gsprocessing-v1.0" - } - - -def test_load_hard_negative_config(tmp_path, gsprocessing_hard_negative_config: Dict, - gsprocessing_non_hard_negative_config: Dict): - # For config with gsprocessing_config.json - json_file_path = f"{tmp_path}/gsprocessing_config.json" - - # Write the dictionary to the JSON file - with open(json_file_path, 'w') as json_file: - json.dump(gsprocessing_hard_negative_config, json_file, indent=4) +def test_load_hard_negative_config(): + # For config with hard negative transformation + json_file_path = f"./config/gsprocessing_hard_negative_config.json" res = load_hard_negative_config(json_file_path) assert res[0] == {'dst_node_type': 'paper', 'edge_type': 'author:writing:paper', 'hard_neg_feat_name': 'hard_neg_feat'} - # For config without hard negative feature definition - json_file_path = f"{tmp_path}/gsprocessing_config.json" - - # Write the dictionary to the JSON file - with open(json_file_path, 'w') as json_file: - json.dump(gsprocessing_non_hard_negative_config, - json_file, indent=4) + # For config without hard negative transformation + json_file_path = f"./config/gsprocessing_non_hard_negative_config.json" res = load_hard_negative_config(json_file_path) assert res == [] -def test_shuffle_hard_negative_nids(tmp_path, gsprocessing_hard_negative_config: Dict): +def test_shuffle_hard_negative_nids(tmp_path): # For config with gsprocessing_config.json - json_file_path = f"{tmp_path}/gsprocessing_config.json" - - # Write the dictionary to the JSON file - with open(json_file_path, 'w') as json_file: - json.dump(gsprocessing_hard_negative_config, json_file, indent=4) + json_file_path = f"./config/gsprocessing_hard_negative_config.json" # Generate dgl graph partitioned_graph = f"{tmp_path}/partitioned_graph" From ca462117ecb1cf26125f3a66c656df93fe38f112 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Mon, 11 Nov 2024 20:59:51 +0000 Subject: [PATCH 36/50] apply comment --- .../data_transformations/dist_feature_transformer.py | 3 --- .../tests/test_dist_hard_negative_transformation.py | 8 ++------ 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py index ec11de7ff5..a42613b26a 100644 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py @@ -43,7 +43,6 @@ def __init__( feature_config: FeatureConfig, spark: SparkSession, json_representation: dict, - edge_mapping_dict: dict = None, ): feat_type = feature_config.feat_type feat_name = feature_config.feat_name @@ -51,8 +50,6 @@ def __init__( self.transformation: DistributedTransformation # We use this to re-apply transformations self.json_representation = json_representation - # Node Mapping Info for hard negative feature transformation - self.edge_mapping_dict = edge_mapping_dict default_kwargs = { "cols": feature_config.cols, diff --git a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py index 179a65dc3c..f153301eb7 100755 --- a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py +++ b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py @@ -65,9 +65,7 @@ def test_hard_negative_example_list(spark: SparkSession, check_df_schema, tmp_pa expected_output = [[1, -1, -1, -1], [2, 3, -1, -1], [3, 0, 1, -1], [0, -1, -1, -1]] for idx, row in enumerate(output_data): - np.testing.assert_equal( - row[0], expected_output[idx], err_msg=f"Row {idx} is not equal" - ) + np.testing.assert_equal(row[0], expected_output[idx], err_msg=f"Row {idx} is not equal") def test_hard_negative_example_str(spark: SparkSession, check_df_schema, tmp_path): @@ -107,6 +105,4 @@ def test_hard_negative_example_str(spark: SparkSession, check_df_schema, tmp_pat expected_output = [[1, -1, -1, -1], [2, 3, -1, -1], [3, 0, 1, -1], [0, -1, -1, -1]] for idx, row in enumerate(output_data): - np.testing.assert_equal( - row[0], expected_output[idx], err_msg=f"Row {idx} is not equal" - ) + np.testing.assert_equal(row[0], expected_output[idx], err_msg=f"Row {idx} is not equal") From eff82aa5e5077ce51b741d169429ab986c43717a Mon Sep 17 00:00:00 2001 From: JalenCato Date: Mon, 11 Nov 2024 21:01:58 +0000 Subject: [PATCH 37/50] roll back --- .../data_transformations/dist_feature_transformer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py index a42613b26a..306b21aaaa 100644 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_feature_transformer.py @@ -39,10 +39,7 @@ class DistFeatureTransformer(object): """ def __init__( - self, - feature_config: FeatureConfig, - spark: SparkSession, - json_representation: dict, + self, feature_config: FeatureConfig, spark: SparkSession, json_representation: dict ): feat_type = feature_config.feat_type feat_name = feature_config.feat_name From 9dd9ff5f58547209e4a96b337aff07111098c56b Mon Sep 17 00:00:00 2001 From: JalenCato Date: Mon, 11 Nov 2024 21:04:08 +0000 Subject: [PATCH 38/50] apply comment --- .../dist_transformations/dist_hard_negative_transformation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index 6ba99f9cd6..131269d0c8 100755 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -91,7 +91,7 @@ class DistHardEdgeNegativeTransformation(DistributedTransformation): spark: SparkSession The spark session. hard_node_mapping_dict: dict - The node type and mapping directory. + The mapping dictionary contain mapping file directory and edge type. """ def __init__( From 1ab7f6d2f1c510a8528a325526f899b5baed847f Mon Sep 17 00:00:00 2001 From: JalenCato Date: Mon, 11 Nov 2024 21:39:12 +0000 Subject: [PATCH 39/50] fix test --- .../gpartition/test_hard_negative_post_partition.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py index dc21115ead..e8616c3280 100644 --- a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py +++ b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py @@ -30,7 +30,8 @@ def test_load_hard_negative_config(): # For config with hard negative transformation - json_file_path = f"./config/gsprocessing_hard_negative_config.json" + json_file_path = (f"/graphstorm/tests/unit-tests/gpartition/" + f"config/gsprocessing_hard_negative_config.json") res = load_hard_negative_config(json_file_path) @@ -38,7 +39,8 @@ def test_load_hard_negative_config(): 'author:writing:paper', 'hard_neg_feat_name': 'hard_neg_feat'} # For config without hard negative transformation - json_file_path = f"./config/gsprocessing_non_hard_negative_config.json" + json_file_path = (f"/graphstorm/tests/unit-tests/gpartition/" + f"config/gsprocessing_non_hard_negative_config.json") res = load_hard_negative_config(json_file_path) @@ -47,7 +49,8 @@ def test_load_hard_negative_config(): def test_shuffle_hard_negative_nids(tmp_path): # For config with gsprocessing_config.json - json_file_path = f"./config/gsprocessing_hard_negative_config.json" + json_file_path = (f"/graphstorm/tests/unit-tests/gpartition/" + f"config/gsprocessing_hard_negative_config.json") # Generate dgl graph partitioned_graph = f"{tmp_path}/partitioned_graph" From f93dff2ed2789c8d243d3422a8cc3bc68aedcdcc Mon Sep 17 00:00:00 2001 From: JalenCato Date: Mon, 11 Nov 2024 22:48:16 +0000 Subject: [PATCH 40/50] check existense for launch_arguments.json --- .../gpartition/dist_partition_graph.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/python/graphstorm/gpartition/dist_partition_graph.py b/python/graphstorm/gpartition/dist_partition_graph.py index 58ed0a57a5..62b319254a 100644 --- a/python/graphstorm/gpartition/dist_partition_graph.py +++ b/python/graphstorm/gpartition/dist_partition_graph.py @@ -193,13 +193,17 @@ def main(): # Hard Negative Mapping # Load GSProcessing config from launch_arguments generated by GSProcessing # Generated GSProcessing config will have _with_transformation suffix. - with open(os.path.join(args.input_path, "launch_arguments.json"), - "r", encoding="utf-8") as f: - gsprocessing_launch_arguments: Dict = json.load(f) - gsprocessing_config = gsprocessing_launch_arguments["config_filename"] - gsprocessing_config = gsprocessing_config.replace(".json", "_with_transformations.json") - shuffle_hard_negative_nids(f"{args.input_path}/{gsprocessing_config}", - args.num_parts, args.output_path) + if os.path.exists(args.input_path, "launch_arguments.json"): + with open(os.path.join(args.input_path, "launch_arguments.json"), + "r", encoding="utf-8") as f: + gsprocessing_launch_arguments: Dict = json.load(f) + gsprocessing_config = gsprocessing_launch_arguments["config_filename"] + gsprocessing_config = gsprocessing_config.replace(".json", "_with_transformations.json") + shuffle_hard_negative_nids(f"{args.input_path}/{gsprocessing_config}", + args.num_parts, args.output_path) + else: + logging.info("Skip the hard negative node ID mapping, " + "the processed data is not generated by GSProcessing.") def parse_args() -> argparse.Namespace: From b3760d8aaf6de546131ec29e2672b56ef2fc4245 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Mon, 11 Nov 2024 22:58:31 +0000 Subject: [PATCH 41/50] check existense for launch_arguments.json --- python/graphstorm/gpartition/dist_partition_graph.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/graphstorm/gpartition/dist_partition_graph.py b/python/graphstorm/gpartition/dist_partition_graph.py index 62b319254a..fffddbe738 100644 --- a/python/graphstorm/gpartition/dist_partition_graph.py +++ b/python/graphstorm/gpartition/dist_partition_graph.py @@ -193,9 +193,9 @@ def main(): # Hard Negative Mapping # Load GSProcessing config from launch_arguments generated by GSProcessing # Generated GSProcessing config will have _with_transformation suffix. - if os.path.exists(args.input_path, "launch_arguments.json"): - with open(os.path.join(args.input_path, "launch_arguments.json"), - "r", encoding="utf-8") as f: + launch_arguments_path = os.path.join(args.input_path, "launch_arguments.json") + if os.path.exists(launch_arguments_path): + with open(launch_arguments_path, "r", encoding="utf-8") as f: gsprocessing_launch_arguments: Dict = json.load(f) gsprocessing_config = gsprocessing_launch_arguments["config_filename"] gsprocessing_config = gsprocessing_config.replace(".json", "_with_transformations.json") From 32f3412c72833b8fec3ef2671667cf298af56682 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 12 Nov 2024 02:03:00 +0000 Subject: [PATCH 42/50] change gsprocessing part --- .../graphstorm_processing/constants.py | 4 ++ .../dist_hard_negative_transformation.py | 41 ++++++++++++------- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/constants.py b/graphstorm-processing/graphstorm_processing/constants.py index 06aa3b3d36..a732306ab8 100644 --- a/graphstorm-processing/graphstorm_processing/constants.py +++ b/graphstorm-processing/graphstorm_processing/constants.py @@ -58,6 +58,10 @@ HUGGINGFACE_TOKENIZE = "tokenize_hf" HUGGINGFACE_EMB = "embedding_hf" +################# Hard Negative transformations ################ +ORDER_INDEX = "hard_negative_order_id" +EXPLODE_HARD_NEGATIVE_VALUE = "hard_negative_exploded_single_value" + ################# Node Mapping ################ NODE_MAPPING_STR = "orig" NODE_MAPPING_INT = "new" diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index 131269d0c8..ffd73c5ec0 100755 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -19,7 +19,12 @@ from pyspark.sql.types import ArrayType, IntegerType, StringType from pyspark.sql import DataFrame, functions as F, SparkSession -from graphstorm_processing.constants import NODE_MAPPING_STR, NODE_MAPPING_INT +from graphstorm_processing.constants import ( + NODE_MAPPING_STR, + NODE_MAPPING_INT, + ORDER_INDEX, + EXPLODE_HARD_NEGATIVE_VALUE, +) from .base_dist_transformation import DistributedTransformation @@ -62,21 +67,29 @@ def apply_transform( ) node_mapping_length = hard_negative_node_mapping.count() - # TODO: This method may suffer from scalability issue, - # we can make this method to join-based solution. - hard_negative_node_mapping_dict = { - row[NODE_MAPPING_STR]: row[NODE_MAPPING_INT] for row in hard_negative_node_mapping.collect() - } + # TODO: Use panda series to possibly improve the efficiency + transformed_df = transformed_df.withColumn(ORDER_INDEX, F.monotonically_increasing_id()) + transformed_df = transformed_df.withColumn( + EXPLODE_HARD_NEGATIVE_VALUE, F.explode(F.col(cols[0])) + ) + transformed_df = transformed_df.join( + hard_negative_node_mapping, + transformed_df[EXPLODE_HARD_NEGATIVE_VALUE] == hard_negative_node_mapping[NODE_MAPPING_STR], + "inner", + ).select(NODE_MAPPING_INT, ORDER_INDEX) + transformed_df = transformed_df.groupBy(ORDER_INDEX).agg( + F.collect_list(NODE_MAPPING_INT).alias(cols[0]) + ) # Same length for feature to convert to tensor - def map_values(hard_neg_list): - mapped_values = [hard_negative_node_mapping_dict.get(item, -1) for item in hard_neg_list] - while len(mapped_values) < node_mapping_length: - mapped_values.append(-1) - return mapped_values - - map_values_udf = F.udf(map_values, ArrayType(IntegerType())) - transformed_df = transformed_df.select(map_values_udf(F.col(cols[0])).alias(cols[0])) + def pad_mapped_values(hard_neg_list): + while len(hard_neg_list) < node_mapping_length: + hard_neg_list.append(-1) + return hard_neg_list + + pad_value_udf = F.udf(pad_mapped_values, ArrayType(IntegerType())) + transformed_df = transformed_df.orderBy(ORDER_INDEX) + transformed_df = transformed_df.select(pad_value_udf(F.col(cols[0])).alias(cols[0])) return transformed_df From c61bdc958d8f261fe284bea8d83f265f509ba424 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 12 Nov 2024 02:54:51 +0000 Subject: [PATCH 43/50] comment --- .../dist_transformations/dist_hard_negative_transformation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index ffd73c5ec0..c0dc715ad9 100755 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -68,6 +68,7 @@ def apply_transform( node_mapping_length = hard_negative_node_mapping.count() # TODO: Use panda series to possibly improve the efficiency + # Explode the original list and join node id mapping dataframe transformed_df = transformed_df.withColumn(ORDER_INDEX, F.monotonically_increasing_id()) transformed_df = transformed_df.withColumn( EXPLODE_HARD_NEGATIVE_VALUE, F.explode(F.col(cols[0])) @@ -88,6 +89,7 @@ def pad_mapped_values(hard_neg_list): return hard_neg_list pad_value_udf = F.udf(pad_mapped_values, ArrayType(IntegerType())) + # Make sure it keeps the original order transformed_df = transformed_df.orderBy(ORDER_INDEX) transformed_df = transformed_df.select(pad_value_udf(F.col(cols[0])).alias(cols[0])) From 90fcd751ce896c0c238b70fb9e52b368635affa7 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 12 Nov 2024 19:47:26 +0000 Subject: [PATCH 44/50] hard negative processing --- .../dist_hard_negative_transformation.py | 124 ++++++++---------- python/graphstorm/gconstruct/utils.py | 36 +++-- .../gpartition/post_hard_negative.py | 15 +-- python/graphstorm/model/utils.py | 13 +- .../test_hard_negative_post_partition.py | 57 ++++---- 5 files changed, 117 insertions(+), 128 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index c0dc715ad9..5ccee36a81 100755 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -29,92 +29,35 @@ from .base_dist_transformation import DistributedTransformation -def apply_transform( - cols: Sequence[str], - separator: str, - spark: SparkSession, - input_df: DataFrame, - hard_node_mapping_dict: dict, -) -> DataFrame: - """Applies hard negative transformation to each row. +class DistHardEdgeNegativeTransformation(DistributedTransformation): + """Transformation to apply hard negative transformation. Parameters ---------- cols : Sequence[str] - List of column names to apply normalization to. - separator: str, optional - The separator for string input value. Only required when input value type is string. + List of column names to apply hard negative transformation to. spark: SparkSession The spark session. - input_df : DataFrame - The input DataFrame to apply transformation to. hard_node_mapping_dict: dict The mapping dictionary contain mapping file directory and edge type. - """ - column_type = input_df.schema[cols[0]].dataType - if isinstance(column_type, StringType): - transformed_df = input_df.withColumn(cols[0], split(col(cols[0]), separator)) - else: - transformed_df = input_df - # Edge type should be (src_ntype:relation_type:dst_ntype) - # Only support hard negative for destination nodes. Get the node type of destination nodes. - # TODO: support hard negative for source nodes. - _, _, dst_type = hard_node_mapping_dict["edge_type"].split(":") - mapping_prefix = hard_node_mapping_dict["mapping_path"] - format_name = hard_node_mapping_dict["format_name"] - hard_negative_node_mapping = spark.read.parquet( - f"{mapping_prefix}{dst_type}/{format_name}/*.parquet" - ) - node_mapping_length = hard_negative_node_mapping.count() - - # TODO: Use panda series to possibly improve the efficiency - # Explode the original list and join node id mapping dataframe - transformed_df = transformed_df.withColumn(ORDER_INDEX, F.monotonically_increasing_id()) - transformed_df = transformed_df.withColumn( - EXPLODE_HARD_NEGATIVE_VALUE, F.explode(F.col(cols[0])) - ) - transformed_df = transformed_df.join( - hard_negative_node_mapping, - transformed_df[EXPLODE_HARD_NEGATIVE_VALUE] == hard_negative_node_mapping[NODE_MAPPING_STR], - "inner", - ).select(NODE_MAPPING_INT, ORDER_INDEX) - transformed_df = transformed_df.groupBy(ORDER_INDEX).agg( - F.collect_list(NODE_MAPPING_INT).alias(cols[0]) - ) - - # Same length for feature to convert to tensor - def pad_mapped_values(hard_neg_list): - while len(hard_neg_list) < node_mapping_length: - hard_neg_list.append(-1) - return hard_neg_list - - pad_value_udf = F.udf(pad_mapped_values, ArrayType(IntegerType())) - # Make sure it keeps the original order - transformed_df = transformed_df.orderBy(ORDER_INDEX) - transformed_df = transformed_df.select(pad_value_udf(F.col(cols[0])).alias(cols[0])) - - return transformed_df - - -class DistHardEdgeNegativeTransformation(DistributedTransformation): - """Transformation to apply hard negative transformation. - - Parameters - ---------- + { + "edge_type": str + Edge type to apply hard negative transformation. + "mapping_path": str + Path to the raw node mapping. + "format_name": str + Parquet. + } separator: str, optional The separator for string input value. Only required when input value type is string. - spark: SparkSession - The spark session. - hard_node_mapping_dict: dict - The mapping dictionary contain mapping file directory and edge type. """ def __init__( self, cols: Sequence[str], spark: SparkSession, + hard_node_mapping_dict: dict, separator: str = "", - hard_node_mapping_dict=None, ) -> None: super().__init__(cols, spark) self.cols = cols @@ -125,9 +68,48 @@ def __init__( def apply(self, input_df: DataFrame) -> DataFrame: assert self.spark - transformed_df = apply_transform( - self.cols, self.separator, self.spark, input_df, self.hard_node_mapping_dict + input_col = self.cols[0] + column_type = input_df.schema[input_col].dataType + if isinstance(column_type, StringType): + transformed_df = input_df.withColumn(input_col, split(col(input_col), self.separator)) + else: + transformed_df = input_df + # Edge type should be (src_ntype:relation_type:dst_ntype) + # Only support hard negative for destination nodes. Get the node type of destination nodes. + # TODO: support hard negative for source nodes. + _, _, dst_type = self.hard_node_mapping_dict["edge_type"].split(":") + mapping_prefix = self.hard_node_mapping_dict["mapping_path"] + format_name = self.hard_node_mapping_dict["format_name"] + hard_negative_node_mapping = self.spark.read.parquet( + f"{mapping_prefix}{dst_type}/{format_name}/" ) + node_mapping_length = hard_negative_node_mapping.count() + + # TODO: Use panda series to possibly improve the efficiency + # Explode the original list and join node id mapping dataframe + transformed_df = transformed_df.withColumn(ORDER_INDEX, F.monotonically_increasing_id()) + transformed_df = transformed_df.withColumn( + EXPLODE_HARD_NEGATIVE_VALUE, F.explode(F.col(input_col)) + ) + transformed_df = transformed_df.join( + hard_negative_node_mapping, + transformed_df[EXPLODE_HARD_NEGATIVE_VALUE] == hard_negative_node_mapping[NODE_MAPPING_STR], + "inner", + ).select(NODE_MAPPING_INT, ORDER_INDEX) + transformed_df = transformed_df.groupBy(ORDER_INDEX).agg( + F.collect_list(NODE_MAPPING_INT).alias(input_col) + ) + + # Same length for feature to convert to tensor + def pad_mapped_values(hard_neg_list): + if len(hard_neg_list) < node_mapping_length: + hard_neg_list.extend([-1] * (node_mapping_length - len(hard_neg_list))) + return hard_neg_list + + pad_value_udf = F.udf(pad_mapped_values, ArrayType(IntegerType())) + # Make sure it keeps the original order + transformed_df = transformed_df.orderBy(ORDER_INDEX) + transformed_df = transformed_df.select(pad_value_udf(F.col(input_col)).alias(input_col)) return transformed_df diff --git a/python/graphstorm/gconstruct/utils.py b/python/graphstorm/gconstruct/utils.py index e921293c2d..a9b60fd872 100644 --- a/python/graphstorm/gconstruct/utils.py +++ b/python/graphstorm/gconstruct/utils.py @@ -1118,6 +1118,28 @@ def get_hard_edge_negs_feats(hard_edge_neg_ops): return hard_edge_neg_feats +def get_gnid2pnid_map(ntype, node_mapping, gnid2pnid_mapping): + """ Get global nid to partitioned nid mapping + + Parameters + ---------- + ntype: str + Path to the directory storing the partitioned graph. + node_mapping: dict of list + Dict of mapping. {ntype: mapping} + gnid2pnid_mapping: dict + Dict of mapping from global nid to partitioned id mapping. + """ + if ntype in gnid2pnid_mapping: + return gnid2pnid_mapping[ntype] + else: + pnid2gnid_map = node_mapping[ntype] + gnid2pnid_map = th.argsort(pnid2gnid_map) + gnid2pnid_mapping[ntype] = gnid2pnid_map + # del ntype in node_mapping to save memory + del node_mapping[ntype] + return gnid2pnid_mapping[ntype] + def shuffle_hard_nids(data_path, num_parts, hard_edge_neg_feats): """ Shuffle node ids of hard negatives from Graph node id space to Partition Node id space. @@ -1136,17 +1158,6 @@ def shuffle_hard_nids(data_path, num_parts, hard_edge_neg_feats): node_mapping = load_maps(data_path, "node_mapping") gnid2pnid_mapping = {} - def get_gnid2pnid_map(ntype): - if ntype in gnid2pnid_mapping: - return gnid2pnid_mapping[ntype] - else: - pnid2gnid_map = node_mapping[ntype] - gnid2pnid_map = th.argsort(pnid2gnid_map) - gnid2pnid_mapping[ntype] = gnid2pnid_map - # del ntype in node_mapping to save memory - del node_mapping[ntype] - return gnid2pnid_mapping[ntype] - # iterate all the partitions to convert hard negative node ids. for i in range(num_parts): part_path = os.path.join(data_path, f"part{i}") @@ -1162,7 +1173,8 @@ def get_gnid2pnid_map(ntype): efeat_name = f"{etype}/{neg_feat}" hard_nids = edge_feats[efeat_name] hard_nid_idx = hard_nids > -1 - gnid2pnid_map = get_gnid2pnid_map(neg_ntype) + gnid2pnid_map = get_gnid2pnid_map(neg_ntype, node_mapping, + gnid2pnid_mapping) hard_nids[hard_nid_idx] = gnid2pnid_map[hard_nids[hard_nid_idx]] # replace the edge_feat.dgl with the updated one. diff --git a/python/graphstorm/gpartition/post_hard_negative.py b/python/graphstorm/gpartition/post_hard_negative.py index 45525894a0..cdb0930fdf 100644 --- a/python/graphstorm/gpartition/post_hard_negative.py +++ b/python/graphstorm/gpartition/post_hard_negative.py @@ -20,6 +20,7 @@ import torch as th from dgl.data.utils import load_tensors, save_tensors from graphstorm.model.utils import load_dist_nid_map +from graphstorm.gconstruct.utils import get_gnid2pnid_map def load_hard_negative_config(gsprocessing_config): @@ -76,17 +77,6 @@ def shuffle_hard_negative_nids(gsprocessing_config, num_parts, graph_path): node_mapping = load_dist_nid_map(f"{graph_path}/dist_graph", node_type_list) gnid2pnid_mapping = {} - def get_gnid2pnid_map(ntype): - if ntype in gnid2pnid_mapping: - return gnid2pnid_mapping[ntype] - else: - pnid2gnid_map = node_mapping[ntype] - gnid2pnid_map = th.argsort(pnid2gnid_map) - gnid2pnid_mapping[ntype] = gnid2pnid_map - # del ntype in node_mapping to save memory - del node_mapping[ntype] - return gnid2pnid_mapping[ntype] - # iterate all the partitions to convert hard negative node ids. for i in range(num_parts): part_path = os.path.join(f"{graph_path}/dist_graph", f"part{i}") @@ -101,7 +91,8 @@ def get_gnid2pnid_map(ntype): efeat_name = f"{etype}/{neg_feat}" hard_nids = edge_feats[efeat_name].long() hard_nid_idx = hard_nids > -1 - gnid2pnid_map = get_gnid2pnid_map(neg_ntype) + gnid2pnid_map = get_gnid2pnid_map(neg_ntype, node_mapping, + gnid2pnid_mapping) hard_nids[hard_nid_idx] = gnid2pnid_map[hard_nids[hard_nid_idx]] # replace the edge_feat.dgl with the updated one. diff --git a/python/graphstorm/model/utils.py b/python/graphstorm/model/utils.py index 410bc6eac3..2499de2814 100644 --- a/python/graphstorm/model/utils.py +++ b/python/graphstorm/model/utils.py @@ -395,7 +395,7 @@ def _exchange_node_id_mapping(rank, world_size, device, def load_dist_nid_map(node_id_mapping_file, ntypes): """ Load id mapping files in dist partition format. - + Parameters ---------- node_id_mapping_file: str @@ -408,11 +408,6 @@ def load_dist_nid_map(node_id_mapping_file, ntypes): id_mappings: dict Node mapping dictionary. """ - return _load_dist_nid_map(node_id_mapping_file, ntypes) - -def _load_dist_nid_map(node_id_mapping_file, ntypes): - """ Load id mapping files in dist partition format. - """ # node_id_mapping_file it is actually a directory # /part0, /part1, ... part_dirs = [part_path for part_path in os.listdir(node_id_mapping_file) \ @@ -464,7 +459,7 @@ def distribute_nid_map(embeddings, rank, world_size, else: # Homogeneous graph # node id mapping file from dgl tools/distpartitioning/convert_partition.py. - ori_node_id_mapping = _load_dist_nid_map(node_id_mapping_file, ["_N"])["_N"] + ori_node_id_mapping = load_dist_nid_map(node_id_mapping_file, ["_N"])["_N"] _, node_id_mapping = th.sort(ori_node_id_mapping) else: node_id_mapping = None @@ -479,7 +474,7 @@ def distribute_nid_map(embeddings, rank, world_size, node_id_mappings = th.load(node_id_mapping_file) else: # node id mapping file from dgl tools/distpartitioning/convert_partition.py. - node_id_mappings = _load_dist_nid_map(node_id_mapping_file, + node_id_mappings = load_dist_nid_map(node_id_mapping_file, list(embeddings.keys())) else: node_id_mappings = None @@ -1189,7 +1184,7 @@ def __init__(self, g, node_id_mapping_file, ntypes=None): id_mappings = th.load(node_id_mapping_file) if get_rank() == 0 else None else: # node id mapping file from dgl tools/distpartitioning/convert_partition.py. - id_mappings = _load_dist_nid_map(node_id_mapping_file, ntypes) \ + id_mappings = load_dist_nid_map(node_id_mapping_file, ntypes) \ if get_rank() == 0 else None self._id_mapping_info = { diff --git a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py index e8616c3280..368562eb3c 100644 --- a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py +++ b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py @@ -27,32 +27,11 @@ from graphstorm.gpartition.post_hard_negative import (shuffle_hard_negative_nids, load_hard_negative_config) +_ROOT = os.path.abspath(os.path.dirname(__file__)) -def test_load_hard_negative_config(): - # For config with hard negative transformation - json_file_path = (f"/graphstorm/tests/unit-tests/gpartition/" - f"config/gsprocessing_hard_negative_config.json") - - res = load_hard_negative_config(json_file_path) - - assert res[0] == {'dst_node_type': 'paper', 'edge_type': - 'author:writing:paper', 'hard_neg_feat_name': 'hard_neg_feat'} - - # For config without hard negative transformation - json_file_path = (f"/graphstorm/tests/unit-tests/gpartition/" - f"config/gsprocessing_non_hard_negative_config.json") - res = load_hard_negative_config(json_file_path) - - assert res == [] - - -def test_shuffle_hard_negative_nids(tmp_path): - # For config with gsprocessing_config.json - json_file_path = (f"/graphstorm/tests/unit-tests/gpartition/" - f"config/gsprocessing_hard_negative_config.json") - - # Generate dgl graph +@pytest.fixture +def setup_graph_partition(tmp_path): partitioned_graph = f"{tmp_path}/partitioned_graph" # Generate ID mapping for each partition @@ -79,6 +58,36 @@ def test_shuffle_hard_negative_nids(tmp_path): reverse_map_dst = {gid: i for i, gid in enumerate(node_mapping["paper"].tolist())} reverse_map_dst[-1] = -1 + return partitioned_graph, reverse_map_dst + + +def test_load_hard_negative_config(): + # For config with hard negative transformation + json_file_path = os.path.join(_ROOT, + "config/gsprocessing_hard_negative_config.json") + + res = load_hard_negative_config(json_file_path) + + assert res[0] == {'dst_node_type': 'paper', 'edge_type': + 'author:writing:paper', 'hard_neg_feat_name': 'hard_neg_feat'} + + # For config without hard negative transformation + json_file_path = os.path.join(_ROOT, + "config/gsprocessing_non_hard_negative_config.json") + + res = load_hard_negative_config(json_file_path) + + assert res == [] + + +def test_shuffle_hard_negative_nids(setup_graph_partition): + # Test the hard negative id shuffling process within distributed setting + + partitioned_graph, reverse_map_dst = setup_graph_partition + # For config with gsprocessing_config.json + json_file_path = os.path.join(_ROOT, + "config/gsprocessing_hard_negative_config.json") + # generate edge features etype = ("author", "writing", "paper") edge_feat_path0 = os.path.join(partitioned_graph, "dist_graph", "part0", "edge_feat.dgl") From 32429532600006ccdb6056b4e881b3f9ead17d05 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 12 Nov 2024 20:00:18 +0000 Subject: [PATCH 45/50] lint --- .../dist_hard_negative_transformation.py | 3 ++- python/graphstorm/gconstruct/utils.py | 8 ++++---- python/graphstorm/gpartition/post_hard_negative.py | 13 +++++++++++-- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index 5ccee36a81..e600e6646e 100755 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -93,7 +93,8 @@ def apply(self, input_df: DataFrame) -> DataFrame: ) transformed_df = transformed_df.join( hard_negative_node_mapping, - transformed_df[EXPLODE_HARD_NEGATIVE_VALUE] == hard_negative_node_mapping[NODE_MAPPING_STR], + transformed_df[EXPLODE_HARD_NEGATIVE_VALUE] + == hard_negative_node_mapping[NODE_MAPPING_STR], "inner", ).select(NODE_MAPPING_INT, ORDER_INDEX) transformed_df = transformed_df.groupBy(ORDER_INDEX).agg( diff --git a/python/graphstorm/gconstruct/utils.py b/python/graphstorm/gconstruct/utils.py index a9b60fd872..e55d1f7173 100644 --- a/python/graphstorm/gconstruct/utils.py +++ b/python/graphstorm/gconstruct/utils.py @@ -1119,16 +1119,16 @@ def get_hard_edge_negs_feats(hard_edge_neg_ops): return hard_edge_neg_feats def get_gnid2pnid_map(ntype, node_mapping, gnid2pnid_mapping): - """ Get global nid to partitioned nid mapping + """ Get global nid to partitioned nid mapping. Parameters ---------- ntype: str Path to the directory storing the partitioned graph. - node_mapping: dict of list - Dict of mapping. {ntype: mapping} + node_mapping: dict + Dict of mapping. {ntype: partitioned nid to global nid mapping} gnid2pnid_mapping: dict - Dict of mapping from global nid to partitioned id mapping. + Dict of mapping. {ntype: global nid to partitioned nid mapping} """ if ntype in gnid2pnid_mapping: return gnid2pnid_mapping[ntype] diff --git a/python/graphstorm/gpartition/post_hard_negative.py b/python/graphstorm/gpartition/post_hard_negative.py index cdb0930fdf..402e68e302 100644 --- a/python/graphstorm/gpartition/post_hard_negative.py +++ b/python/graphstorm/gpartition/post_hard_negative.py @@ -17,7 +17,6 @@ import json import os -import torch as th from dgl.data.utils import load_tensors, save_tensors from graphstorm.model.utils import load_dist_nid_map from graphstorm.gconstruct.utils import get_gnid2pnid_map @@ -30,6 +29,17 @@ def load_hard_negative_config(gsprocessing_config): ---------------- gsprocessing_config: str Path to the gsprocessing config. + + Returns + ------- + list of dicts + A list of dict for each hard negative feature transformation. + Each dict will look like: + { + "dst_node_type": destination node type for hard negative, + "edge_type": edge_type, + "hard_neg_feat_name": feature name + } """ with open(gsprocessing_config, 'r', encoding='utf-8') as file: config = json.load(file) @@ -57,7 +67,6 @@ def load_hard_negative_config(gsprocessing_config): def shuffle_hard_negative_nids(gsprocessing_config, num_parts, graph_path): """Shuffle hard negative edge feature ids with int-to-int node id mapping. The function here align with the shuffle_hard_nids in graphstorm.gconstruct.utils. - Create an additional function to handle the id mappings under the distributed setting. Parameters ---------------- From 0dd32f9e11a3c78c1fc0fe9e18cf2a31cb3cf2e4 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 12 Nov 2024 21:33:35 +0000 Subject: [PATCH 46/50] fix comment --- .../dist_transformations/dist_hard_negative_transformation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index e600e6646e..adaa270f98 100755 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -101,7 +101,7 @@ def apply(self, input_df: DataFrame) -> DataFrame: F.collect_list(NODE_MAPPING_INT).alias(input_col) ) - # Same length for feature to convert to tensor + # Extend the feature to the same length as total number of nodes within one node type def pad_mapped_values(hard_neg_list): if len(hard_neg_list) < node_mapping_length: hard_neg_list.extend([-1] * (node_mapping_length - len(hard_neg_list))) From 7929f05de81d75a534ded95427bad9bd38dbbe52 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 12 Nov 2024 22:34:30 +0000 Subject: [PATCH 47/50] change maximum size --- .../dist_hard_negative_transformation.py | 9 +++++---- .../tests/test_dist_hard_negative_transformation.py | 8 ++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index adaa270f98..90d3b54031 100755 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -15,7 +15,7 @@ """ from typing import Sequence -from pyspark.sql.functions import split, col +from pyspark.sql.functions import split, col, size from pyspark.sql.types import ArrayType, IntegerType, StringType from pyspark.sql import DataFrame, functions as F, SparkSession @@ -83,7 +83,8 @@ def apply(self, input_df: DataFrame) -> DataFrame: hard_negative_node_mapping = self.spark.read.parquet( f"{mapping_prefix}{dst_type}/{format_name}/" ) - node_mapping_length = hard_negative_node_mapping.count() + max_size = transformed_df.select(F.size(F.col(input_col)).alias(f"{input_col}_size")) \ + .agg(F.max(f"{input_col}_size")).collect()[0][0] # TODO: Use panda series to possibly improve the efficiency # Explode the original list and join node id mapping dataframe @@ -103,8 +104,8 @@ def apply(self, input_df: DataFrame) -> DataFrame: # Extend the feature to the same length as total number of nodes within one node type def pad_mapped_values(hard_neg_list): - if len(hard_neg_list) < node_mapping_length: - hard_neg_list.extend([-1] * (node_mapping_length - len(hard_neg_list))) + if len(hard_neg_list) < max_size: + hard_neg_list.extend([-1] * (max_size - len(hard_neg_list))) return hard_neg_list pad_value_udf = F.udf(pad_mapped_values, ArrayType(IntegerType())) diff --git a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py index f153301eb7..8932bd1dbe 100755 --- a/graphstorm-processing/tests/test_dist_hard_negative_transformation.py +++ b/graphstorm-processing/tests/test_dist_hard_negative_transformation.py @@ -61,8 +61,8 @@ def test_hard_negative_example_list(spark: SparkSession, check_df_schema, tmp_pa check_df_schema(output_df) output_data = output_df.collect() - # Length should be 4 for each tensor because there are 4 distinct nodes for dst node - expected_output = [[1, -1, -1, -1], [2, 3, -1, -1], [3, 0, 1, -1], [0, -1, -1, -1]] + # All the length should be the same as the maximum array. + expected_output = [[1, -1, -1], [2, 3, -1], [3, 0, 1], [0, -1, -1]] for idx, row in enumerate(output_data): np.testing.assert_equal(row[0], expected_output[idx], err_msg=f"Row {idx} is not equal") @@ -101,8 +101,8 @@ def test_hard_negative_example_str(spark: SparkSession, check_df_schema, tmp_pat check_df_schema(output_df) output_data = output_df.collect() - # Length should be 4 for each tensor because there are 4 distinct nodes for dst node - expected_output = [[1, -1, -1, -1], [2, 3, -1, -1], [3, 0, 1, -1], [0, -1, -1, -1]] + # All the length should be the same as the maximum array. + expected_output = [[1, -1, -1], [2, 3, -1], [3, 0, 1], [0, -1, -1]] for idx, row in enumerate(output_data): np.testing.assert_equal(row[0], expected_output[idx], err_msg=f"Row {idx} is not equal") From d2e03dba1ae0808d4681d5cb94ca5665422e30fa Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 12 Nov 2024 22:39:41 +0000 Subject: [PATCH 48/50] black --- .../dist_hard_negative_transformation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index 90d3b54031..b700fc89fa 100755 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -83,8 +83,11 @@ def apply(self, input_df: DataFrame) -> DataFrame: hard_negative_node_mapping = self.spark.read.parquet( f"{mapping_prefix}{dst_type}/{format_name}/" ) - max_size = transformed_df.select(F.size(F.col(input_col)).alias(f"{input_col}_size")) \ - .agg(F.max(f"{input_col}_size")).collect()[0][0] + max_size = ( + transformed_df.select(F.size(F.col(input_col)).alias(f"{input_col}_size")) + .agg(F.max(f"{input_col}_size")) + .collect()[0][0] + ) # TODO: Use panda series to possibly improve the efficiency # Explode the original list and join node id mapping dataframe From d4da2ca3efcb458521afe47db032bb54414fbeea Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 12 Nov 2024 22:43:46 +0000 Subject: [PATCH 49/50] remove size --- .../dist_transformations/dist_hard_negative_transformation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index b700fc89fa..2a72ab4d6c 100755 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -15,7 +15,7 @@ """ from typing import Sequence -from pyspark.sql.functions import split, col, size +from pyspark.sql.functions import split, col from pyspark.sql.types import ArrayType, IntegerType, StringType from pyspark.sql import DataFrame, functions as F, SparkSession From 35d4cbd3b1dc5fc4139728b59d0d2b761c841e50 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Wed, 13 Nov 2024 23:17:01 +0000 Subject: [PATCH 50/50] test --- .../dist_hard_negative_transformation.py | 4 +++- python/graphstorm/gconstruct/utils.py | 23 +++++++++++++++---- .../gpartition/post_hard_negative.py | 5 ++-- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py index 2a72ab4d6c..35dd4005cc 100755 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_hard_negative_transformation.py @@ -83,6 +83,7 @@ def apply(self, input_df: DataFrame) -> DataFrame: hard_negative_node_mapping = self.spark.read.parquet( f"{mapping_prefix}{dst_type}/{format_name}/" ) + # The maximum number of negatives in the input feature column max_size = ( transformed_df.select(F.size(F.col(input_col)).alias(f"{input_col}_size")) .agg(F.max(f"{input_col}_size")) @@ -92,6 +93,7 @@ def apply(self, input_df: DataFrame) -> DataFrame: # TODO: Use panda series to possibly improve the efficiency # Explode the original list and join node id mapping dataframe transformed_df = transformed_df.withColumn(ORDER_INDEX, F.monotonically_increasing_id()) + # Could result in extremely large DFs in num_nodes * avg(len_of_negatives) rows transformed_df = transformed_df.withColumn( EXPLODE_HARD_NEGATIVE_VALUE, F.explode(F.col(input_col)) ) @@ -105,7 +107,7 @@ def apply(self, input_df: DataFrame) -> DataFrame: F.collect_list(NODE_MAPPING_INT).alias(input_col) ) - # Extend the feature to the same length as total number of nodes within one node type + # Extend the feature to the same length as the maximum length of the feature column def pad_mapped_values(hard_neg_list): if len(hard_neg_list) < max_size: hard_neg_list.extend([-1] * (max_size - len(hard_neg_list))) diff --git a/python/graphstorm/gconstruct/utils.py b/python/graphstorm/gconstruct/utils.py index e55d1f7173..d2077f632e 100644 --- a/python/graphstorm/gconstruct/utils.py +++ b/python/graphstorm/gconstruct/utils.py @@ -1118,17 +1118,32 @@ def get_hard_edge_negs_feats(hard_edge_neg_ops): return hard_edge_neg_feats -def get_gnid2pnid_map(ntype, node_mapping, gnid2pnid_mapping): +def get_gnid2pnid_map(ntype: str, node_mapping: dict, gnid2pnid_mapping: dict): """ Get global nid to partitioned nid mapping. Parameters ---------- ntype: str - Path to the directory storing the partitioned graph. + Node type. node_mapping: dict - Dict of mapping. {ntype: partitioned nid to global nid mapping} + Dict of mapping. + { + ntype: 1D tensor representing the mapping from + partition node IDs (pnid) to global node IDs (gnid). + Each index corresponds to a partition node IDs, and + the value at that index is the global node IDs. + } gnid2pnid_mapping: dict - Dict of mapping. {ntype: global nid to partitioned nid mapping} + Dict of mapping. Here are the mapping represented: + { + ntype: 1D tensor representing the mapping from + global node IDs (gnid) to partition node IDs (pnid). + Each index corresponds to a global node ID, and + the value at that index is the partition node ID. + } + + Returns + 1-D node Mapping Tensor for target node type. """ if ntype in gnid2pnid_mapping: return gnid2pnid_mapping[ntype] diff --git a/python/graphstorm/gpartition/post_hard_negative.py b/python/graphstorm/gpartition/post_hard_negative.py index 402e68e302..0c86d85fc5 100644 --- a/python/graphstorm/gpartition/post_hard_negative.py +++ b/python/graphstorm/gpartition/post_hard_negative.py @@ -22,7 +22,7 @@ from graphstorm.gconstruct.utils import get_gnid2pnid_map -def load_hard_negative_config(gsprocessing_config): +def load_hard_negative_config(gsprocessing_config: str): """Load GSProcessing Config to extract hard negative config Parameters @@ -64,7 +64,8 @@ def load_hard_negative_config(gsprocessing_config): return hard_neg_list -def shuffle_hard_negative_nids(gsprocessing_config, num_parts, graph_path): +def shuffle_hard_negative_nids(gsprocessing_config: str, + num_parts: int, graph_path: str): """Shuffle hard negative edge feature ids with int-to-int node id mapping. The function here align with the shuffle_hard_nids in graphstorm.gconstruct.utils.