Skip to content

Commit

Permalink
change user interface
Browse files Browse the repository at this point in the history
  • Loading branch information
jalencato committed Oct 17, 2023
1 parent 4a6467c commit d95ff66
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,9 @@
{
"feature_col": "age",
"transform": {
"name": "bucket",
"bucket": [3, 10, 20, 30, 40]}
"name": "bucket_numerical",
"range": [10, 40],
"bucket_cnt": 4}
}
]
}
Expand Down
77 changes: 56 additions & 21 deletions python/graphstorm/gconstruct/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,16 +342,23 @@ class BucketTransform(FeatTransform):
The name of the column that contains the feature.
feat_name : str
The feature name used in the constructed graph.
bucket: list[num]:
The bucket list used for bucket transformation
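bucket_cnt : int
The number of buckets used in the bucket feature transform.
range : list[num]
The value range covered by the buckets. Only its minimum and maximum
are used, as the start and end points of the bucketed interval.
slide_window_size : num
The total width of the window centered on each feature value; every
bucket the window overlaps is marked. Default: 0.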
out_dtype:
The dtype of the transformed feature.
Default: None, we will not do data type casting.
"""
def __init__(self, col_name, feat_name, bucket, out_dtype=None):
assert bucket is not None, "bucket must be provided for bucket transform"
self.bucket = bucket
self.bucket_cnt = len(bucket) - 1
def __init__(self, col_name, feat_name, bucket_cnt,
             range, slide_window_size=0, out_dtype=None):
    """Configure a bucket transform over a fixed numerical range.

    Parameters
    ----------
    col_name : str
        Forwarded unchanged to the parent constructor.
    feat_name : str
        Forwarded unchanged to the parent constructor.
    bucket_cnt : int
        The number of buckets. Must be a positive integer.
    range : list[num]
        The values bounding the bucketed interval. Stored as given.
    slide_window_size : num
        Stored as given for use by the transform. Default: 0.
    out_dtype :
        The dtype of the transformed feature.
        Default: None, in which case np.float32 is used.

    Raises
    ------
    AssertionError
        If bucket_cnt or range is None, or if bucket_cnt is not a
        positive integer.
    """
    # NOTE: `range` shadows the builtin, but it is part of the keyword
    # interface used by callers, so the name is kept.
    assert bucket_cnt is not None, \
        "bucket count must be provided for bucket feature transform"
    assert range is not None, \
        "bucket range must be provided for bucket feature transform"
    # Fail early: a zero, negative or non-integer bucket count would
    # otherwise only surface later as a division or array-shape error
    # when the transform is applied.
    assert isinstance(bucket_cnt, (int, np.integer)) and bucket_cnt > 0, \
        f"bucket count must be a positive integer, but got {bucket_cnt}"
    self.bucket_cnt = bucket_cnt
    self.range = range
    self.slide_window_size = slide_window_size
    out_dtype = np.float32 if out_dtype is None else out_dtype
    super(BucketTransform, self).__init__(col_name, feat_name, out_dtype)

Expand All @@ -372,18 +379,37 @@ def call(self, feats):
assert np.issubdtype(feats.dtype, np.integer) \
or np.issubdtype(feats.dtype, np.floating), \
f"The feature {self.feat_name} has to be integers or floats."
bucket = sorted(self.bucket)
bin_indices = pd.cut(feats, bucket, labels=False)
for i, ele in enumerate(bin_indices):
if pd.notnull(ele):
continue
bin_indices[i] = 0. if feats[i] <= self.bucket[0] \
else self.bucket_cnt - 1
bin_indices = bin_indices.astype(np.int64)

print("num value: ", feats)
encoding = np.zeros((len(feats), self.bucket_cnt), dtype=np.int8)
for i, emb in enumerate(encoding):
emb[bin_indices[i]] = 1
max_val = max(self.range)
min_val = min(self.range)
bucket_size = (max_val - min_val) / self.bucket_cnt
for i, f in enumerate(feats):
high_val = min(f + self.slide_window_size / 2, max_val)
low_val = max(f - self.slide_window_size / 2, min_val)

# Early exits to avoid numpy calls
membership_list = [0.0] * self.bucket_cnt
if f >= max_val:
membership_list[-1] = 1.0
encoding[i] = membership_list
continue
if f <= min_val:
membership_list[0] = 1.0
encoding[i] = membership_list
continue

# Determine upper and lower bucket membership
low_val -= min_val
high_val -= min_val
low_idx = max(round(low_val / bucket_size), 0)
high_idx = min((round(high_val / bucket_size)) + 1, self.bucket_cnt)

idx = np.arange(start=low_idx, stop=high_idx, dtype=int)
membership_list = np.zeros(self.bucket_cnt, dtype=float)
membership_list[idx] = 1.0
encoding[i] = membership_list

return {self.feat_name: encoding}

Expand Down Expand Up @@ -983,13 +1009,22 @@ def parse_feat_ops(confs):
separator = conf['separator'] if 'separator' in conf else None
transform = CategoricalTransform(feat['feature_col'], feat_name,
separator=separator, transform_conf=conf)
elif conf['name'] == 'bucket':
assert 'bucket' in conf, \
"It is required to provide bucket information for bucket feature transform"
bucket = conf['bucket']
elif conf['name'] == 'bucket_numerical':
assert 'bucket_cnt' in conf, \
"It is required to count of bucket information for bucket feature transform"
assert 'range' in conf, \
"It is required to provide range information for bucket feature transform"
bucket_cnt = conf['bucket_cnt']
range = conf['range']
if 'slide_window_size' in conf:
slide_window_size = conf['slide_window_size']
else:
slide_window_size = 0
transform = BucketTransform(feat['feature_col'],
feat_name,
bucket=bucket,
bucket_cnt=bucket_cnt,
range=range,
slide_window_size=slide_window_size,
out_dtype=out_dtype)
else:
raise ValueError('Unknown operation: {}'.format(conf['name']))
Expand Down
26 changes: 21 additions & 5 deletions tests/unit-tests/gconstruct/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,8 +637,9 @@ def test_classification_processor():

@pytest.mark.parametrize("out_dtype", [None, np.float16])
def test_bucket_transform(out_dtype):
bucket = [10, 20, 30]
transform = BucketTransform("test", "test", bucket=bucket, out_dtype=out_dtype)
bucket_range = [10, 30]
transform = BucketTransform("test", "test", 2,
range=bucket_range, slide_window_size=0, out_dtype=out_dtype)
feats = np.zeros(4)
feats[0], feats[1], feats[2], feats[3] = 1, 11, 21, 31
bucket_feats = transform(feats)
Expand All @@ -650,9 +651,10 @@ def test_bucket_transform(out_dtype):
feats_tar = np.array([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=out_dtype)
assert_equal(bucket_feats['test'], feats_tar)

bucket = [1.1, 2.1, 3.1]
feats[0], feats[1], feats[2], feats[3] = 0.1, 1.2, 2.2, 3.2
transform = BucketTransform("test", "test", bucket=bucket, out_dtype=out_dtype)
bucket_range = [1.1, 2.1, 3.1]
feats[0], feats[1], feats[2], feats[3] = 0.2, 1.2, 2.2, 3.2
transform = BucketTransform("test", "test", 2,
range=bucket_range, slide_window_size=0, out_dtype=out_dtype)
bucket_feats = transform(feats)
if out_dtype is not None:
assert bucket_feats['test'].dtype == np.float16
Expand All @@ -662,6 +664,20 @@ def test_bucket_transform(out_dtype):
feats_tar = np.array([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=out_dtype)
assert_equal(bucket_feats['test'], feats_tar)

bucket_range = [10, 30]
transform = BucketTransform("test", "test", 2,
range=bucket_range, slide_window_size=10, out_dtype=out_dtype)
feats = np.zeros(4)
feats[0], feats[1], feats[2], feats[3] = 1, 11, 21, 31
bucket_feats = transform(feats)
if out_dtype is not None:
assert bucket_feats['test'].dtype == np.float16
else:
assert bucket_feats['test'].dtype == np.float32

feats_tar = np.array([[1, 0], [1, 0], [1, 1], [0, 1]], dtype=out_dtype)
assert_equal(bucket_feats['test'], feats_tar)

if __name__ == '__main__':
test_categorize_transform()
test_get_output_dtype()
Expand Down

0 comments on commit d95ff66

Please sign in to comment.