diff --git a/README.md b/README.md index edbb01c..1de4332 100644 --- a/README.md +++ b/README.md @@ -232,6 +232,63 @@ for index, point in enumerate(points): ![Image](https://github.com/kLabUM/rrcf/blob/master/resources/sine.png) +## Obtain feature importance + +This example shows how to estimate feature importance from the cut dimension that is obtained while computing the CoDisp of each point. + + +```python +import numpy as np +import pandas as pd +import rrcf + +# Set parameters +np.random.seed(0) +n = 2010 +d = 3 +num_trees = 100 +tree_size = 256 + +# Generate data +X = np.zeros((n, d)) +X[:1000,0] = 5 +X[1000:2000,0] = -5 +X += 0.01*np.random.randn(*X.shape) + +# Construct forest +forest = [] +while len(forest) < num_trees: + # Select random subsets of points uniformly from point set + ixs = np.random.choice(n, size=(n // tree_size, tree_size), + replace=False) + # Add sampled trees to forest + trees = [rrcf.RCTree(X[ix], index_labels=ix) for ix in ixs] + forest.extend(trees) + + +# Compute average CoDisp with the cut dimension for each point +dim_codisp = np.zeros([n,d],dtype=float) +index = np.zeros(n) +for tree in forest: + for leaf in tree.leaves: + codisp,cutdim = tree.codisp_with_cut_dimension(leaf) + + dim_codisp[leaf,cutdim] += codisp + + index[leaf] += 1 + +avg_codisp = dim_codisp.sum(axis=1)/index + +# Keep the points above the CoDisp anomaly threshold and average the CoDisp over each feature +feature_importance_anomaly = np.mean(dim_codisp[avg_codisp>50,:],axis=0) +# Create a dataframe with the feature importance +df_feature_importance = pd.DataFrame(feature_importance_anomaly,columns=['feature_importance']) +df_feature_importance +``` +![Image](https://raw.githubusercontent.com/kLabUM/rrcf/master/resources/feature_importance.png) + + + ## Contributing We welcome contributions to the `rrcf` repo. To contribute, submit a [pull request](https://help.github.com/en/articles/about-pull-requests) to the `dev` branch. 
diff --git a/docs/index.md b/docs/index.md index 192343c..399241b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -244,6 +244,64 @@ for index, point in enumerate(points): ![Image](https://raw.githubusercontent.com/kLabUM/rrcf/master/resources/sine.png) + +## Obtain feature importance + +This example shows how to estimate feature importance from the cut dimension that is obtained while computing the CoDisp of each point. + + +```python +import numpy as np +import pandas as pd +import rrcf + +# Set parameters +np.random.seed(0) +n = 2010 +d = 3 +num_trees = 100 +tree_size = 256 + +# Generate data +X = np.zeros((n, d)) +X[:1000,0] = 5 +X[1000:2000,0] = -5 +X += 0.01*np.random.randn(*X.shape) + +# Construct forest +forest = [] +while len(forest) < num_trees: + # Select random subsets of points uniformly from point set + ixs = np.random.choice(n, size=(n // tree_size, tree_size), + replace=False) + # Add sampled trees to forest + trees = [rrcf.RCTree(X[ix], index_labels=ix) for ix in ixs] + forest.extend(trees) + + +# Compute average CoDisp with the cut dimension for each point +dim_codisp = np.zeros([n,d],dtype=float) +index = np.zeros(n) +for tree in forest: + for leaf in tree.leaves: + codisp,cutdim = tree.codisp_with_cut_dimension(leaf) + + dim_codisp[leaf,cutdim] += codisp + + index[leaf] += 1 + +avg_codisp = dim_codisp.sum(axis=1)/index + +# Keep the points above the CoDisp anomaly threshold and average the CoDisp over each feature +feature_importance_anomaly = np.mean(dim_codisp[avg_codisp>50,:],axis=0) +# Create a dataframe with the feature importance +df_feature_importance = pd.DataFrame(feature_importance_anomaly,columns=['feature_importance']) +df_feature_importance +``` + +![Image](https://raw.githubusercontent.com/kLabUM/rrcf/master/resources/feature_importance.png) + + ## Contributing We welcome contributions to the `rrcf` repo. To contribute, submit a [pull request](https://help.github.com/en/articles/about-pull-requests) to the `dev` branch. 
def codisp_with_cut_dimension(self, leaf):
    """
    Compute collusive displacement at leaf and the dimension of the cut.
    This method can be used to find the most important features that
    determined the CoDisp.

    Parameters:
    -----------
    leaf: index of leaf or Leaf instance

    Returns:
    --------
    codisplacement: float
                    Collusive displacement if leaf is removed.
    cut_dimension: int or None
                   Dimension of the cut at the ancestor that yields the
                   maximal displacement ratio. None if leaf is the root
                   of the tree, since no cut separates it from anything.

    Raises:
    -------
    KeyError
        If leaf is neither a Leaf instance nor a key of self.leaves.

    Example:
    --------
    # Create RCTree
    >>> X = np.random.randn(100, 2)
    >>> tree = rrcf.RCTree(X)
    >>> new_point = np.array([4, 4])
    >>> tree.insert_point(new_point, index=100)

    # Compute collusive displacement with dimension
    >>> tree.codisp_with_cut_dimension(100)
    (31.667, 1)
    """
    if not isinstance(leaf, Leaf):
        try:
            leaf = self.leaves[leaf]
        except KeyError:
            raise KeyError(
                'leaf must be a Leaf instance or key to self.leaves')
    # Handle case where leaf is root: always return a pair so that callers
    # can unpack the result unconditionally (a bare 0 would raise TypeError
    # on `codisp, cut_dim = ...`).
    if leaf is self.root:
        return 0, None
    node = leaf
    results = []
    cut_dimensions = []
    # Walk from the leaf up towards the root, one ancestor per level
    for _ in range(node.d):
        parent = node.u
        if parent is None:
            break
        sibling = parent.r if node is parent.l else parent.l
        # Ratio of points displaced (sibling subtree) to points removed
        # (current subtree) if the current subtree were deleted
        num_deleted = node.n
        displacement = sibling.n
        results.append(displacement / num_deleted)
        # Remember which dimension the parent's cut was made on
        cut_dimensions.append(parent.q)
        node = parent
    # First maximal ratio wins, i.e. the ancestor closest to the leaf
    argmax = np.argmax(results)
    return results[argmax], cut_dimensions[argmax]
def test_codisp_with_cut_dimension():
    """Each of the first 100 leaves has a positive CoDisp and a cut dimension in [0, d)."""
    for leaf_index in range(100):
        displacement, dimension = tree.codisp_with_cut_dimension(leaf_index)
        assert displacement > 0
        assert 0 <= dimension < d