From 0fc856cf0ff1dfa722135b6960c507f5f3f0e420 Mon Sep 17 00:00:00 2001 From: Theodore Vasiloudis Date: Wed, 27 Sep 2023 10:17:12 -0700 Subject: [PATCH] [GSProcessing] Name reverse edges as `dst:relation-rev:src` (#490) *Issue #, if available:* *Description of changes:* * We use `dst:relation-rev:src` for reverse edge names instead of `dst:rev-relation:src` to match GConstruct. *Testing*: * pytest and test job (with re-partition follow-up) succeed By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. --- .../dist_heterogeneous_loader.py | 6 ++-- .../repartition_files.py | 4 +-- .../repartitioning/partitioned_metadata.json | 6 ++-- .../tests/test_dist_heterogenous_loader.py | 28 +++++++++---------- .../tests/test_repartition_files.py | 4 +-- 5 files changed, 24 insertions(+), 24 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py index bb844d1ef0..93a5d08109 100644 --- a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py +++ b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py @@ -315,7 +315,7 @@ def _initialize_metadata_dict( # Add original and reverse edge types edge_types.append(f"{src_type}:{rel_type}:{dst_type}") if self.add_reverse_edges: - edge_types.append(f"{dst_type}:rev-{rel_type}:{src_type}") + edge_types.append(f"{dst_type}:{rel_type}-rev:{src_type}") metadata_dict["edge_type"] = edge_types metadata_dict["node_type"] = sorted(node_type_set) @@ -1072,7 +1072,7 @@ def write_edge_structure( f"{edge_config.src_ntype}:{edge_config.get_relation_name()}:{edge_config.dst_ntype}" ) rev_edge_type = ( - f"{edge_config.dst_ntype}:rev-{edge_config.get_relation_name()}:{edge_config.src_ntype}" + f"{edge_config.dst_ntype}:{edge_config.get_relation_name()}-rev:{edge_config.src_ntype}" ) src_node_id_mapping = ( @@ -1223,7 +1223,7 @@ def process_edge_data(self, edge_configs: Sequence[EdgeConfig]) -> Tuple[Dict, D ) reverse_edge_type = ( f"{edge_config.dst_ntype}" - f":rev-{edge_config.get_relation_name()}" + f":{edge_config.get_relation_name()}-rev" f":{edge_config.src_ntype}" ) logging.info("Processing edge type '%s'...", edge_type) diff --git a/graphstorm-processing/graphstorm_processing/repartition_files.py b/graphstorm-processing/graphstorm_processing/repartition_files.py index d0105267f0..1d13d34253 100644 --- a/graphstorm-processing/graphstorm_processing/repartition_files.py +++ b/graphstorm-processing/graphstorm_processing/repartition_files.py @@ -833,7 +833,7 @@ def main(): for type_idx, (type_name, type_data_dict) in enumerate(edge_data_meta.items()): src, relation, dst = type_name.split(":") - if relation.startswith("rev-"): + if relation.endswith("-rev"): # Reverse edge types do not have their own data, # and if needed we re-partition their structure while # handling the "regular" edge type. @@ -845,7 +845,7 @@ def main(): type_name, ) continue - reverse_edge_type_name = f"{dst}:rev-{relation}:{src}" + reverse_edge_type_name = f"{dst}:{relation}-rev:{src}" most_frequent_counts = list(edge_row_counts_frequencies[type_name].most_common(1)[0][0]) repartitioner = ParquetRepartitioner( input_prefix, filesystem_type, region, verify_outputs=True diff --git a/graphstorm-processing/tests/resources/repartitioning/partitioned_metadata.json b/graphstorm-processing/tests/resources/repartitioning/partitioned_metadata.json index 3ddaa0e2ca..bbdf13eb03 100644 --- a/graphstorm-processing/tests/resources/repartitioning/partitioned_metadata.json +++ b/graphstorm-processing/tests/resources/repartitioning/partitioned_metadata.json @@ -1,7 +1,7 @@ { "edge_type": [ "src:dummy_type:dst", - "dst:rev-dummy_type:src" + "dst:dummy_type-rev:src" ], "edges": { "src:dummy_type:dst": { @@ -18,7 +18,7 @@ "delimiter": "" } }, - "dst:rev-dummy_type:src": { + "dst:dummy_type-rev:src": { "data": [ "edges/dummy_type/parquet/part-00000.parquet", "edges/dummy_type/parquet/part-00001.parquet", @@ -78,7 +78,7 @@ } } }, - "dst:rev-dummy_type:src": { + "dst:dummy_type-rev:src": { "label": { "data":[ "edge_data/dummy_type-label/parquet/part-00000.parquet", diff --git a/graphstorm-processing/tests/test_dist_heterogenous_loader.py b/graphstorm-processing/tests/test_dist_heterogenous_loader.py index c005a6527d..13e5038f9f 100644 --- a/graphstorm-processing/tests/test_dist_heterogenous_loader.py +++ b/graphstorm-processing/tests/test_dist_heterogenous_loader.py @@ -165,11 +165,11 @@ def verify_integ_test_output( assert metadata["node_type"] == ["director", "genre", "movie", "user"] assert metadata["edge_type"] == [ "movie:included_in:genre", - "genre:rev-included_in:movie", + "genre:included_in-rev:movie", "user:rated:movie", - "movie:rev-rated:user", + "movie:rated-rev:user", "director:directed:movie", - "movie:rev-directed:director", + "movie:directed-rev:director", ] expected_node_counts = {"director": 3, "genre": 2, "movie": 4, "user": 5} @@ -182,11 +182,11 @@ def verify_integ_test_output( expected_edge_counts = { "movie:included_in:genre": 4, - "genre:rev-included_in:movie": 4, + "genre:included_in-rev:movie": 4, "user:rated:movie": 6, - "movie:rev-rated:user": 6, + "movie:rated-rev:user": 6, "director:directed:movie": 4, - "movie:rev-directed:director": 4, + "movie:directed-rev:director": 4, } for edge_type in metadata["edge_type"]: @@ -266,11 +266,11 @@ def test_load_dist_hgl_without_labels(dghl_loader_no_label: DistHeterogeneousGra "task_type": "link_predict", "etype_label": [ "movie:included_in:genre", - "genre:rev-included_in:movie", + "genre:included_in-rev:movie", "user:rated:movie", - "movie:rev-rated:user", + "movie:rated-rev:user", "director:directed:movie", - "movie:rev-directed:director", + "movie:directed-rev:director", ], "etype_label_property": [], "ntype_label": [], @@ -283,18 +283,18 @@ def test_load_dist_hgl_without_labels(dghl_loader_no_label: DistHeterogeneousGra expected_edge_data = { "user:rated:movie": {"train_mask", "val_mask", "test_mask"}, - "movie:rev-rated:user": {"train_mask", "val_mask", "test_mask"}, + "movie:rated-rev:user": {"train_mask", "val_mask", "test_mask"}, "movie:included_in:genre": {"train_mask", "val_mask", "test_mask"}, - "genre:rev-included_in:movie": {"train_mask", "val_mask", "test_mask"}, + "genre:included_in-rev:movie": {"train_mask", "val_mask", "test_mask"}, "director:directed:movie": {"train_mask", "val_mask", "test_mask"}, - "movie:rev-directed:director": {"train_mask", "val_mask", "test_mask"}, + "movie:directed-rev:director": {"train_mask", "val_mask", "test_mask"}, } for edge_type in metadata["edge_data"]: assert metadata["edge_data"][edge_type].keys() == expected_edge_data[edge_type] - if not "rev-" in edge_type: + if not "-rev" in edge_type: src_type, relation, dst_type = edge_type.split(":") - rev_type = f"{dst_type}:rev-{relation}:{src_type}" + rev_type = f"{dst_type}:{relation}-rev:{src_type}" assert ( metadata["edge_data"][rev_type]["train_mask"] == metadata["edge_data"][edge_type]["train_mask"] diff --git a/graphstorm-processing/tests/test_repartition_files.py b/graphstorm-processing/tests/test_repartition_files.py index e2a08968db..eb9dc45bba 100644 --- a/graphstorm-processing/tests/test_repartition_files.py +++ b/graphstorm-processing/tests/test_repartition_files.py @@ -217,8 +217,8 @@ def test_verify_metadata_only_edge_data(): row_counts = [10, 10, 10, 10, 10] original_metadata_dict["edge_data"]["src:dummy_type:dst"]["label"]["row_counts"] = row_counts original_metadata_dict["edges"]["src:dummy_type:dst"]["row_counts"] = row_counts - original_metadata_dict["edges"].pop("dst:rev-dummy_type:src") - original_metadata_dict["edge_data"].pop("dst:rev-dummy_type:src") + original_metadata_dict["edges"].pop("dst:dummy_type-rev:src") + original_metadata_dict["edge_data"].pop("dst:dummy_type-rev:src") # Ensure success when counts match repartition_files.verify_metadata(