Commit
Add more tests
Xiang Song committed Dec 22, 2023
1 parent 09527ba commit cc203c3
Showing 3 changed files with 33 additions and 2 deletions.
6 changes: 5 additions & 1 deletion python/graphstorm/gconstruct/file_io.py
@@ -20,6 +20,7 @@
import glob
import json
import os
import logging

import pyarrow.parquet as pq
import pyarrow as pa
@@ -222,7 +223,10 @@ def read_data_parquet(data_file, data_fields=None):
d = np.stack(new_d)
except Exception: # pylint: disable=broad-exception-caught
# Keep it as an ndarray of ndarrays.
# This can happen when loading hard negatives for a hard negative transformation.
logging.warning("The %s column of parquet file %s has "
                "variable-length features; this is only supported when "
                "the transformation is a hard negative transformation.",
                key, data_file)
pass
data[key] = d
return data
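For context, a minimal standalone sketch of the fallback the warning describes (not part of this commit; the file path and column name are made up): when the rows of a parquet list column have different lengths, np.stack raises and the column is kept as an object array of per-row values.

import numpy as np
import pandas as pd

pd.DataFrame({"neg": [[0, 0, 1, 1], [2, 2], [3, 3]]}).to_parquet("/tmp/ragged.parquet")
col = pd.read_parquet("/tmp/ragged.parquet")["neg"].to_numpy()
try:
    d = np.stack(col)   # fails here: rows have lengths 4, 2, 2
except Exception:       # same broad fallback as in read_data_parquet above
    d = col             # kept as an object-dtype array of per-row values
print(d.dtype)          # object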
22 changes: 21 additions & 1 deletion tests/end2end-tests/data_process/data_gen.py
@@ -20,7 +20,7 @@
import json
import os
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
import numpy as np

from graphstorm.gconstruct.file_io import write_data_parquet, write_data_json, write_data_csv
@@ -121,6 +121,10 @@ def gen_rand_nid(max_nid, num_nodes):
edge_data3_2 = {
'data': src3 + node_id3[dst_idx],
}
# Hard negative data with two entries per edge; the first row is doubled
# to four entries so the column has variable-length rows.
edge_data3_3 = {
'data': [[nid, nid] for nid in dst3]
}
edge_data3_3['data'][0] = edge_data3_3['data'][0] + edge_data3_3['data'][0]

in_dir = '/tmp/test_data/'
out_dir = '/tmp/test_out/'
@@ -153,6 +157,10 @@ def split_data(data, num):
write_data_hdf5(edge_data1_2, os.path.join(in_dir, f'edge_data1_2.hdf5'))
for i, edge_data in enumerate(split_data(edge_data3, 10)):
write_data_parquet(edge_data, os.path.join(in_dir, f'edge_data3_{i}.parquet'))
# Write the variable-length hard negative data to parquet via pandas.
df = pd.DataFrame(edge_data3_3)
df.to_parquet(os.path.join(in_dir, 'ng_edge_data3.parquet'))

write_data_hdf5(edge_data3_2, os.path.join(in_dir, f'edge_data3_2.hdf5'))


@@ -383,6 +391,18 @@ def split_data(data, num):
}
],
},
{
"relation": ("node2", "relation3", "node3"),
"format": {"name": "parquet"},
"files": os.path.join(in_dir, "ng_edge_data3.parquet"),
"features": [
{
"feature_col": "data",
"feature_name": "hard_neg2",
"transform": {"name": "edge_dst_hard_negative"}
},
],
},
{
"relation": ("node2", "relation3", "node3"),
"format": {"name": "hdf5"},
7 changes: 7 additions & 0 deletions tests/end2end-tests/data_process/test_data.py
@@ -215,3 +215,10 @@ def read_data_parquet(data_file):
_, dst_ids = g.edges(etype=("node2", "relation3", "node3"))
ground_truth = th.cat((dst_ids.reshape(-1,1), dst_ids.reshape(-1,1)), dim=1)
assert th.sum(hard_neg-ground_truth) == 0

hard_neg = g.edges[("node2", "relation3", "node3")].data["hard_neg2"]
_, dst_ids = g.edges(etype=("node2", "relation3", "node3"))
# Expected layout: each edge keeps its two duplicated destination ids padded
# with -1 up to the longest row (4); the first edge has four real entries.
ground_truth = th.cat([dst_ids.reshape(-1,1),
                       dst_ids.reshape(-1,1),
                       th.full((dst_ids.shape[0], 2), -1, dtype=dst_ids.dtype)], dim=1)
ground_truth[0][2] = dst_ids[0]
ground_truth[0][3] = dst_ids[0]
assert th.sum(hard_neg-ground_truth) == 0
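As a self-contained illustration of the ground truth built above (destination ids made up): most rows hold two duplicated destination ids padded with -1, while the first edge holds four.

import torch as th

dst_ids = th.tensor([5, 7, 9])
expected = th.cat([dst_ids.reshape(-1, 1),
                   dst_ids.reshape(-1, 1),
                   th.full((dst_ids.shape[0], 2), -1, dtype=dst_ids.dtype)], dim=1)
expected[0][2] = dst_ids[0]
expected[0][3] = dst_ids[0]
print(expected)
# tensor([[ 5,  5,  5,  5],
#         [ 7,  7, -1, -1],
#         [ 9,  9, -1, -1]])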
