From 1f085da8be3620223376d36930ac13ad1741a2cf Mon Sep 17 00:00:00 2001
From: JalenCato
Date: Mon, 4 Nov 2024 23:42:26 +0000
Subject: [PATCH] add tests for the gspartition hard negative post-partition step

---
 tests/unit-tests/gpartition/conftest.py       |   7 +-
 .../test_hard_negative_post_partition.py      | 340 ++++++++++++++++++
 2 files changed, 345 insertions(+), 2 deletions(-)
 create mode 100644 tests/unit-tests/gpartition/test_hard_negative_post_partition.py

diff --git a/tests/unit-tests/gpartition/conftest.py b/tests/unit-tests/gpartition/conftest.py
index 6e522e3a5e..9f2770e1c7 100644
--- a/tests/unit-tests/gpartition/conftest.py
+++ b/tests/unit-tests/gpartition/conftest.py
@@ -21,6 +21,8 @@
 import pytest
 
 from graphstorm.gpartition import LocalPartitionAlgorithm
+from graphstorm.gpartition.post_hard_negative import (shuffle_hard_negative_nids,
+                                                      load_hard_negative_config)
 
 @pytest.fixture(scope="module", name="chunked_metadata_dict")
 def metadata_dict_fixture() -> Dict:
@@ -29,6 +31,7 @@
         "node_type": ["a", "b"],
     }
 
+
 def simple_test_partition(
         partition_algorithm: LocalPartitionAlgorithm,
         algorithm_name: str,
@@ -61,7 +64,7 @@
     with open(os.path.join(tmpdir, "partition_meta.json"), 'r', encoding="utf-8") as f:
         part_meta = json.load(f)
         assert part_meta["num_parts"] == num_parts
-        assert part_meta["algo_name"] == algorithm_name
+        assert part_meta["algo_name"] == algorithm_name
 
     # Ensure contents of partition assignment files are correct
     for i, node_type in enumerate(chunked_metadata_dict["node_type"]):
@@ -70,4 +73,4 @@
         assert len(node_partitions) == chunked_metadata_dict["num_nodes_per_type"][i]
         for part_id in node_partitions:
             assert part_id.isdigit()
-            assert int(part_id) < num_parts
+            assert int(part_id) < num_parts
\ No newline at end of file
diff --git a/tests/unit-tests/gpartition/test_hard_negative_post_partition.py b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py
new file mode 100644
index 0000000000..0c3b32bf05
--- /dev/null
+++ b/tests/unit-tests/gpartition/test_hard_negative_post_partition.py
@@ -0,0 +1,340 @@
+"""
+    Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+""" +import os +import json +import torch as th +import numpy as np +from typing import Dict + +import pytest + +from numpy.testing import assert_almost_equal +from graphstorm.model.utils import load_dist_nid_map +from dgl.data.utils import load_tensors, save_tensors +from graphstorm.gpartition.post_hard_negative import (shuffle_hard_negative_nids, + load_hard_negative_config) + +@pytest.fixture(scope="module", name="gsprocessing_hard_negative_config") +def gsprocessing_config_hard_negative_dict_fixture() -> Dict: + return{ + "graph": { + "nodes": [ + { + "data": { + "format": "parquet", + "files": [ + "./nodes/author.parquet" + ] + }, + "type": "author", + "column": "node_id", + }, + { + "data": { + "format": "parquet", + "files": [ + "./nodes/paper.parquet" + ] + }, + "type": "paper", + "column": "node_id", + "features": [ + { + "column": "feat", + "name": "feat", + "transformation": { + "name": "no-op" + } + } + ], + "labels": [ + { + "column": "label", + "type": "classification", + "split_rate": { + "train": 0.8, + "val": 0.1, + "test": 0.1 + } + } + ] + } + ], + "edges": [ + { + "data": { + "format": "parquet", + "files": [ + "./edges/author_writing_paper_hard_negative.parquet" + ] + }, + "source": { + "column": "source_id", + "type": "author" + }, + "dest": { + "column": "dest_id", + "type": "paper" + }, + "relation": { + "type": "writing" + }, + "features": [ + { + "column": "hard_neg", + "name": "hard_neg_feat", + "transformation": { + "name": "edge_dst_hard_negative", + "kwargs": { + "separator": ";" + } + } + } + ] + }, + { + "data": { + "format": "parquet", + "files": [ + "./edges/paper_citing_paper.parquet" + ] + }, + "source": { + "column": "source_id", + "type": "paper" + }, + "dest": { + "column": "dest_id", + "type": "paper" + }, + "relation": { + "type": "citing" + }, + "labels": [ + { + "column": "", + "type": "link_prediction", + "split_rate": { + "train": 0.8, + "val": 0.1, + "test": 0.1 + } + } + ] + } + ] + }, + "version": "gsprocessing-v1.0" + } + + +@pytest.fixture(scope="module", name="gsprocessing_non_hard_negative_config") +def gsprocessing_config_non_hard_negative_dict_fixture() -> Dict: + return{ + "graph": { + "nodes": [ + { + "data": { + "format": "parquet", + "files": [ + "./nodes/author.parquet" + ] + }, + "type": "author", + "column": "node_id", + }, + { + "data": { + "format": "parquet", + "files": [ + "./nodes/paper.parquet" + ] + }, + "type": "paper", + "column": "node_id", + "features": [ + { + "column": "feat", + "name": "feat", + "transformation": { + "name": "no-op" + } + } + ], + "labels": [ + { + "column": "label", + "type": "classification", + "split_rate": { + "train": 0.8, + "val": 0.1, + "test": 0.1 + } + } + ] + } + ], + "edges": [ + { + "data": { + "format": "parquet", + "files": [ + "./edges/author_writing_paper_hard_negative.parquet" + ] + }, + "source": { + "column": "source_id", + "type": "author" + }, + "dest": { + "column": "dest_id", + "type": "paper" + }, + "relation": { + "type": "writing" + } + }, + { + "data": { + "format": "parquet", + "files": [ + "./edges/paper_citing_paper.parquet" + ] + }, + "source": { + "column": "source_id", + "type": "paper" + }, + "dest": { + "column": "dest_id", + "type": "paper" + }, + "relation": { + "type": "citing" + }, + "labels": [ + { + "column": "", + "type": "link_prediction", + "split_rate": { + "train": 0.8, + "val": 0.1, + "test": 0.1 + } + } + ] + } + ] + }, + "version": "gsprocessing-v1.0" + } + + +def test_load_hard_negative_config(tmp_path, gsprocessing_hard_negative_config: Dict, + 
+                                   gsprocessing_non_hard_negative_config: Dict):
+    # Case 1: config that defines a hard negative transformation
+    json_file_path = f"{tmp_path}/gsprocessing_config.json"
+
+    # Write the dictionary to the JSON file
+    with open(json_file_path, 'w') as json_file:
+        json.dump(gsprocessing_hard_negative_config, json_file, indent=4)
+
+    res = load_hard_negative_config(json_file_path)
+
+    assert res[0] == {'dst_node_type': 'paper', 'edge_type': 'author:writing:paper',
+                      'hard_neg_feat_name': 'hard_neg_feat'}
+
+    # Case 2: config without a hard negative feature definition
+    json_file_path = f"{tmp_path}/gsprocessing_config.json"
+
+    # Write the dictionary to the JSON file
+    with open(json_file_path, 'w') as json_file:
+        json.dump(gsprocessing_non_hard_negative_config,
+                  json_file, indent=4)
+
+    res = load_hard_negative_config(json_file_path)
+
+    assert res == []
+
+
+def test_shuffle_hard_negative_nids(tmp_path, gsprocessing_hard_negative_config: Dict):
+    # Config with the hard negative transformation
+    json_file_path = f"{tmp_path}/gsprocessing_config.json"
+
+    # Write the dictionary to the JSON file
+    with open(json_file_path, 'w') as json_file:
+        json.dump(gsprocessing_hard_negative_config, json_file, indent=4)
+
+    # Root directory of the partitioned graph
+    partitioned_graph = f"{tmp_path}/partitioned_graph"
+
+    # Generate ID mapping for each partition
+    nid_map_dict_path0 = os.path.join(partitioned_graph, "dist_graph", "part0", "orig_nids.dgl")
+    nid_map_dict_path1 = os.path.join(partitioned_graph, "dist_graph", "part1", "orig_nids.dgl")
+    os.makedirs(os.path.dirname(nid_map_dict_path0), exist_ok=True)
+    os.makedirs(os.path.dirname(nid_map_dict_path1), exist_ok=True)
+
+    # Use randperm so the ID mapping is non-trivial; otherwise no shuffling would be needed
+    nid_map0 = {
+        "paper": th.randperm(100),
+        "author": th.arange(200, 300)
+    }
+    save_tensors(nid_map_dict_path0, nid_map0)
+
+    nid_map1 = {
+        "paper": th.randperm(100) + 100,
+        "author": th.arange(300, 400)
+    }
+    save_tensors(nid_map_dict_path1, nid_map1)
+
+    # Create the reversed map: original global ID -> shuffled ID
+    node_mapping = load_dist_nid_map(f"{partitioned_graph}/dist_graph", ["author", "paper"])
+    reverse_map_dst = {gid: i for i, gid in enumerate(node_mapping["paper"].tolist())}
+    reverse_map_dst[-1] = -1
+
+    # Generate edge features
+    etype = ("author", "writing", "paper")
+    edge_feat_path0 = os.path.join(partitioned_graph, "dist_graph", "part0", "edge_feat.dgl")
+    edge_feat_path1 = os.path.join(partitioned_graph, "dist_graph", "part1", "edge_feat.dgl")
+    os.makedirs(os.path.dirname(edge_feat_path0), exist_ok=True)
+    os.makedirs(os.path.dirname(edge_feat_path1), exist_ok=True)
+
+    paper_writing_hard_neg0 = th.cat((th.randint(0, 100, (100, 100)),
+                                      th.full((100, 10), -1, dtype=th.int32)), dim=1)
+    paper_writing_hard_neg0_shuffled = [
+        [reverse_map_dst[nid] for nid in negs]
+        for negs in paper_writing_hard_neg0.tolist()]
+    paper_writing_hard_neg0_shuffled = np.array(paper_writing_hard_neg0_shuffled)
+    paper_writing_hard_neg1 = th.cat((th.randint(100, 200, (100, 100)),
+                                      th.full((100, 10), -1, dtype=th.int32)), dim=1)
+    paper_writing_hard_neg1_shuffled = [
+        [reverse_map_dst[nid] for nid in negs]
+        for negs in paper_writing_hard_neg1.tolist()]
+    paper_writing_hard_neg1_shuffled = np.array(paper_writing_hard_neg1_shuffled)
+
+    save_tensors(edge_feat_path0, {":".join(etype) + "/hard_neg_feat": paper_writing_hard_neg0})
+    save_tensors(edge_feat_path1, {":".join(etype) + "/hard_neg_feat": paper_writing_hard_neg1})
+
+    # Do the shuffling
+    shuffle_hard_negative_nids(json_file_path, 2, partitioned_graph)
+
+    # Assert: the stored features must now hold shuffled IDs
+    shuffled_feats0 = load_tensors(edge_feat_path0)
+    assert_almost_equal(shuffled_feats0[":".join(etype) + "/hard_neg_feat"].numpy(),
+                        paper_writing_hard_neg0_shuffled)
+    shuffled_feats1 = load_tensors(edge_feat_path1)
+    assert_almost_equal(shuffled_feats1[":".join(etype) + "/hard_neg_feat"].numpy(),
+                        paper_writing_hard_neg1_shuffled)
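
Reviewer note (outside the patch): test_load_hard_negative_config pins down the output
contract of load_hard_negative_config: one entry per edge feature produced by an
"edge_dst_hard_negative" transformation, and an empty list otherwise. The sketch below
only illustrates that contract as the test encodes it; it is not GraphStorm's
implementation, and the helper name find_hard_negative_features is invented for the
illustration.

    # Hypothetical helper mirroring the result shape asserted in the test above.
    import json
    from typing import Dict, List

    def find_hard_negative_features(config_path: str) -> List[Dict[str, str]]:
        """Collect edge features built by an "edge_dst_hard_negative" transform."""
        with open(config_path, 'r', encoding="utf-8") as f:
            config = json.load(f)
        results = []
        for edge in config["graph"]["edges"]:
            edge_type = ":".join([edge["source"]["type"],
                                  edge["relation"]["type"],
                                  edge["dest"]["type"]])
            # Edges without a "features" block (e.g. the citing edge) are skipped
            for feat in edge.get("features", []):
                if feat.get("transformation", {}).get("name") == "edge_dst_hard_negative":
                    results.append({
                        "dst_node_type": edge["dest"]["type"],
                        "edge_type": edge_type,
                        "hard_neg_feat_name": feat["name"],
                    })
        return results  # empty list when no hard negative transform is configured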
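
Reviewer note (outside the patch): the assertions in test_shuffle_hard_negative_nids
encode the remapping shuffle_hard_negative_nids is expected to apply to each partition's
hard negative feature: every original (global) destination node ID is replaced with its
shuffled ID from orig_nids.dgl, while -1 padding passes through unchanged. A minimal
sketch of that remapping, mirroring the reverse_map_dst built in the test; the function
name remap_hard_negatives is invented for the illustration:

    import torch as th

    def remap_hard_negatives(hard_negs: th.Tensor, orig_nids: th.Tensor) -> th.Tensor:
        """Map original (global) dst node IDs to their shuffled partition IDs.

        orig_nids is a per-node-type tensor of the kind stored in orig_nids.dgl:
        position i holds the original ID of the node whose shuffled ID is i.
        -1 entries in hard_negs are padding and pass through unchanged.
        """
        # Invert the mapping, exactly like reverse_map_dst in the test above
        reverse_map = {gid: i for i, gid in enumerate(orig_nids.tolist())}
        reverse_map[-1] = -1  # padding stays padding
        return th.tensor([[reverse_map[nid] for nid in row]
                          for row in hard_negs.tolist()])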