From d95ff66e330d6ffa7e5ebefc09c953f508e64a33 Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 17 Oct 2023 21:51:21 +0000 Subject: [PATCH] change user interface --- .../gconstruct-config.json | 5 +- python/graphstorm/gconstruct/transform.py | 77 ++++++++++++++----- tests/unit-tests/gconstruct/test_transform.py | 26 +++++-- 3 files changed, 80 insertions(+), 28 deletions(-) diff --git a/graphstorm-processing/tests/resources/small_heterogeneous_graph/gconstruct-config.json b/graphstorm-processing/tests/resources/small_heterogeneous_graph/gconstruct-config.json index 11bb236af9..6c4b64d451 100644 --- a/graphstorm-processing/tests/resources/small_heterogeneous_graph/gconstruct-config.json +++ b/graphstorm-processing/tests/resources/small_heterogeneous_graph/gconstruct-config.json @@ -43,8 +43,9 @@ { "feature_col": "age", "transform": { - "name": "bucket", - "bucket": [3, 10, 20, 30, 40]} + "name": "bucket_numerical", + "range": [10, 40], + "bucket_cnt": 4} } ] } diff --git a/python/graphstorm/gconstruct/transform.py b/python/graphstorm/gconstruct/transform.py index e7ba438e8e..788028fd69 100644 --- a/python/graphstorm/gconstruct/transform.py +++ b/python/graphstorm/gconstruct/transform.py @@ -342,16 +342,23 @@ class BucketTransform(FeatTransform): The name of the column that contains the feature. feat_name : str The feature name used in the constructed graph. - bucket: list[num]: - The bucket list used for bucket transformation + bucket_cnt: num: + The count of bucket lists used in the bucket feature transform + range: list[num]: + The range of bucket lists only defining the start and end point out_dtype: The dtype of the transformed feature. Default: None, we will not do data type casting. """ - def __init__(self, col_name, feat_name, bucket, out_dtype=None): - assert bucket is not None, "bucket must be provided for bucket transform" - self.bucket = bucket - self.bucket_cnt = len(bucket) - 1 + def __init__(self, col_name, feat_name, bucket_cnt, + range, slide_window_size=0, out_dtype=None): + assert bucket_cnt is not None, \ + "bucket count must be provided for bucket feature transform" + assert range is not None, \ + "bucket range must be provided for bucket feature transform" + self.bucket_cnt = bucket_cnt + self.range = range + self.slide_window_size = slide_window_size out_dtype = np.float32 if out_dtype is None else out_dtype super(BucketTransform, self).__init__(col_name, feat_name, out_dtype) @@ -372,18 +379,37 @@ def call(self, feats): assert np.issubdtype(feats.dtype, np.integer) \ or np.issubdtype(feats.dtype, np.floating), \ f"The feature {self.feat_name} has to be integers or floats." - bucket = sorted(self.bucket) - bin_indices = pd.cut(feats, bucket, labels=False) - for i, ele in enumerate(bin_indices): - if pd.notnull(ele): - continue - bin_indices[i] = 0. if feats[i] <= self.bucket[0] \ - else self.bucket_cnt - 1 - bin_indices = bin_indices.astype(np.int64) + print("num value: ", feats) encoding = np.zeros((len(feats), self.bucket_cnt), dtype=np.int8) - for i, emb in enumerate(encoding): - emb[bin_indices[i]] = 1 + max_val = max(self.range) + min_val = min(self.range) + bucket_size = (max_val - min_val) / self.bucket_cnt + for i, f in enumerate(feats): + high_val = min(f + self.slide_window_size / 2, max_val) + low_val = max(f - self.slide_window_size / 2, min_val) + + # Early exits to avoid numpy calls + membership_list = [0.0] * self.bucket_cnt + if f >= max_val: + membership_list[-1] = 1.0 + encoding[i] = membership_list + continue + if f <= min_val: + membership_list[0] = 1.0 + encoding[i] = membership_list + continue + + # Determine upper and lower bucket membership + low_val -= min_val + high_val -= min_val + low_idx = max(round(low_val / bucket_size), 0) + high_idx = min((round(high_val / bucket_size)) + 1, self.bucket_cnt) + + idx = np.arange(start=low_idx, stop=high_idx, dtype=int) + membership_list = np.zeros(self.bucket_cnt, dtype=float) + membership_list[idx] = 1.0 + encoding[i] = membership_list return {self.feat_name: encoding} @@ -983,13 +1009,22 @@ def parse_feat_ops(confs): separator = conf['separator'] if 'separator' in conf else None transform = CategoricalTransform(feat['feature_col'], feat_name, separator=separator, transform_conf=conf) - elif conf['name'] == 'bucket': - assert 'bucket' in conf, \ - "It is required to provide bucket information for bucket feature transform" - bucket = conf['bucket'] + elif conf['name'] == 'bucket_numerical': + assert 'bucket_cnt' in conf, \ + "It is required to count of bucket information for bucket feature transform" + assert 'range' in conf, \ + "It is required to provide range information for bucket feature transform" + bucket_cnt = conf['bucket_cnt'] + range = conf['range'] + if 'slide_window_size' in conf: + slide_window_size = conf['slide_window_size'] + else: + slide_window_size = 0 transform = BucketTransform(feat['feature_col'], feat_name, - bucket=bucket, + bucket_cnt=bucket_cnt, + range=range, + slide_window_size=slide_window_size, out_dtype=out_dtype) else: raise ValueError('Unknown operation: {}'.format(conf['name'])) diff --git a/tests/unit-tests/gconstruct/test_transform.py b/tests/unit-tests/gconstruct/test_transform.py index be36075b03..d488c0ce3c 100644 --- a/tests/unit-tests/gconstruct/test_transform.py +++ b/tests/unit-tests/gconstruct/test_transform.py @@ -637,8 +637,9 @@ def test_classification_processor(): @pytest.mark.parametrize("out_dtype", [None, np.float16]) def test_bucket_transform(out_dtype): - bucket = [10, 20, 30] - transform = BucketTransform("test", "test", bucket=bucket, out_dtype=out_dtype) + bucket_range = [10, 30] + transform = BucketTransform("test", "test", 2, + range=bucket_range, slide_window_size=0, out_dtype=out_dtype) feats = np.zeros(4) feats[0], feats[1], feats[2], feats[3] = 1, 11, 21, 31 bucket_feats = transform(feats) @@ -650,9 +651,10 @@ def test_bucket_transform(out_dtype): feats_tar = np.array([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=out_dtype) assert_equal(bucket_feats['test'], feats_tar) - bucket = [1.1, 2.1, 3.1] - feats[0], feats[1], feats[2], feats[3] = 0.1, 1.2, 2.2, 3.2 - transform = BucketTransform("test", "test", bucket=bucket, out_dtype=out_dtype) + bucket_range = [1.1, 2.1, 3.1] + feats[0], feats[1], feats[2], feats[3] = 0.2, 1.2, 2.2, 3.2 + transform = BucketTransform("test", "test", 2, + range=bucket_range, slide_window_size=0, out_dtype=out_dtype) bucket_feats = transform(feats) if out_dtype is not None: assert bucket_feats['test'].dtype == np.float16 @@ -662,6 +664,20 @@ def test_bucket_transform(out_dtype): feats_tar = np.array([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=out_dtype) assert_equal(bucket_feats['test'], feats_tar) + bucket_range = [10, 30] + transform = BucketTransform("test", "test", 2, + range=bucket_range, slide_window_size=10, out_dtype=out_dtype) + feats = np.zeros(4) + feats[0], feats[1], feats[2], feats[3] = 1, 11, 21, 31 + bucket_feats = transform(feats) + if out_dtype is not None: + assert bucket_feats['test'].dtype == np.float16 + else: + assert bucket_feats['test'].dtype == np.float32 + + feats_tar = np.array([[1, 0], [1, 0], [1, 1], [0, 1]], dtype=out_dtype) + assert_equal(bucket_feats['test'], feats_tar) + if __name__ == '__main__': test_categorize_transform() test_get_output_dtype()