Skip to content

Commit

Permalink
Merge pull request #100 from JavierVeraOlmos/master
Browse files Browse the repository at this point in the history
Added a method to calculate feature importance
  • Loading branch information
mdbartos authored May 22, 2023
2 parents 98f057f + 624ffb7 commit 8175353
Show file tree
Hide file tree
Showing 5 changed files with 183 additions and 0 deletions.
57 changes: 57 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,63 @@ for index, point in enumerate(points):

![Image](https://github.com/kLabUM/rrcf/blob/master/resources/sine.png)

## Obtain feature importance

This example shows how to estimate feature importance using the cut dimension that is returned together with each point's CoDisp.


```python
import numpy as np
import pandas as pd
import rrcf

# Experiment configuration
np.random.seed(0)
n = 2010
d = 3
num_trees = 100
tree_size = 256

# Synthetic data: the first feature takes the value +5 or -5 for the first
# 2000 points and stays at 0 for the last 10; small Gaussian noise everywhere
X = np.zeros((n, d))
X[:1000, 0] = 5
X[1000:2000, 0] = -5
X += 0.01 * np.random.randn(*X.shape)

# Grow the forest in batches of (n // tree_size) trees until it holds at
# least num_trees trees
forest = []
while len(forest) < num_trees:
    # Each row of ixs is a sample of tree_size distinct point indices
    ixs = np.random.choice(n, size=(n // tree_size, tree_size),
                           replace=False)
    # One tree per row, with leaves labelled by the original point index
    forest.extend(rrcf.RCTree(X[ix], index_labels=ix) for ix in ixs)

# Accumulate, for every point, its CoDisp under the dimension of the cut
# that produced it, and count how many trees contained the point
dim_codisp = np.zeros((n, d))
n_trees_seen = np.zeros(n)
for tree in forest:
    for leaf in tree.leaves:
        score, cut_dimension = tree.codisp_with_cut_dimension(leaf)
        dim_codisp[leaf, cut_dimension] += score
        n_trees_seen[leaf] += 1

# Average CoDisp of each point over the trees that contained it
avg_codisp = dim_codisp.sum(axis=1) / n_trees_seen

# Keep only the points above the CoDisp anomaly threshold and average their
# per-dimension CoDisp to get one importance value per feature
is_anomaly = avg_codisp > 50
feature_importance_anomaly = dim_codisp[is_anomaly].mean(axis=0)

# Present the feature importance as a dataframe
df_feature_importance = pd.DataFrame(feature_importance_anomaly,
                                     columns=['feature_importance'])
df_feature_importance
```
![Image](https://raw.githubusercontent.com/kLabUM/rrcf/master/resources/feature_importance.png)



## Contributing

We welcome contributions to the `rrcf` repo. To contribute, submit a [pull request](https://help.github.com/en/articles/about-pull-requests) to the `dev` branch.
Expand Down
58 changes: 58 additions & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,64 @@ for index, point in enumerate(points):

![Image](https://raw.githubusercontent.com/kLabUM/rrcf/master/resources/sine.png)


## Obtain feature importance

This example shows how to estimate feature importance using the cut dimension that is returned together with each point's CoDisp.


```python
import numpy as np
import pandas as pd
import rrcf

# Experiment configuration
np.random.seed(0)
n = 2010
d = 3
num_trees = 100
tree_size = 256

# Synthetic data: the first feature takes the value +5 or -5 for the first
# 2000 points and stays at 0 for the last 10; small Gaussian noise everywhere
X = np.zeros((n, d))
X[:1000, 0] = 5
X[1000:2000, 0] = -5
X += 0.01 * np.random.randn(*X.shape)

# Grow the forest in batches of (n // tree_size) trees until it holds at
# least num_trees trees
forest = []
while len(forest) < num_trees:
    # Each row of ixs is a sample of tree_size distinct point indices
    ixs = np.random.choice(n, size=(n // tree_size, tree_size),
                           replace=False)
    # One tree per row, with leaves labelled by the original point index
    forest.extend(rrcf.RCTree(X[ix], index_labels=ix) for ix in ixs)

# Accumulate, for every point, its CoDisp under the dimension of the cut
# that produced it, and count how many trees contained the point
dim_codisp = np.zeros((n, d))
n_trees_seen = np.zeros(n)
for tree in forest:
    for leaf in tree.leaves:
        score, cut_dimension = tree.codisp_with_cut_dimension(leaf)
        dim_codisp[leaf, cut_dimension] += score
        n_trees_seen[leaf] += 1

# Average CoDisp of each point over the trees that contained it
avg_codisp = dim_codisp.sum(axis=1) / n_trees_seen

# Keep only the points above the CoDisp anomaly threshold and average their
# per-dimension CoDisp to get one importance value per feature
is_anomaly = avg_codisp > 50
feature_importance_anomaly = dim_codisp[is_anomaly].mean(axis=0)

# Present the feature importance as a dataframe
df_feature_importance = pd.DataFrame(feature_importance_anomaly,
                                     columns=['feature_importance'])
df_feature_importance
```

![Image](https://raw.githubusercontent.com/kLabUM/rrcf/master/resources/feature_importance.png)


## Contributing

We welcome contributions to the `rrcf` repo. To contribute, submit a [pull request](https://help.github.com/en/articles/about-pull-requests) to the `dev` branch.
Expand Down
Binary file added resources/feature_importance.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
61 changes: 61 additions & 0 deletions rrcf/rrcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,67 @@ def codisp(self, leaf):
co_displacement = max(results)
return co_displacement


def codisp_with_cut_dimension(self, leaf):
    """
    Compute collusive displacement at leaf and the dimension of the cut.
    This method can be used to find the most important features that
    determined the CoDisp.

    Parameters:
    -----------
    leaf: index of leaf or Leaf instance

    Returns:
    --------
    codisplacement: float
                    Collusive displacement if leaf is removed.
    cut_dimension: int or None
                   Dimension of the cut (the `q` attribute) at the ancestor
                   where the displacement ratio is largest. None when leaf
                   is the root of the tree, since no cut exists in that case.

    Raises:
    -------
    KeyError
        If leaf is neither a Leaf instance nor a key of self.leaves.

    Example:
    --------
    # Create RCTree
    >>> X = np.random.randn(100, 2)
    >>> tree = rrcf.RCTree(X)
    >>> new_point = np.array([4, 4])
    >>> tree.insert_point(new_point, index=100)

    # Compute collusive displacement with dimension
    >>> tree.codisp_with_cut_dimension(100)

    (31.667, 1)
    """
    if not isinstance(leaf, Leaf):
        try:
            leaf = self.leaves[leaf]
        except KeyError:
            raise KeyError(
                'leaf must be a Leaf instance or key to self.leaves')
    # Handle case where leaf is root. Always return a pair so that callers
    # can unpack `codisp, cut_dimension` uniformly; there is no cut above
    # the root, hence no cut dimension.
    if leaf is self.root:
        return 0, None
    node = leaf
    results = []
    cut_dimensions = []
    # Walk from the leaf towards the root; node.d bounds the number of
    # steps (presumably the depth of the leaf -- confirm against Leaf).
    for _ in range(node.d):
        parent = node.u
        if parent is None:
            break
        # The sibling is whichever child of the parent is not `node`
        if node is parent.l:
            sibling = parent.r
        else:
            sibling = parent.l
        # Ratio of the sibling's `n` to the current subtree's `n`
        num_deleted = node.n
        displacement = sibling.n
        result = (displacement / num_deleted)
        results.append(result)
        # Record the cut dimension stored on the parent for this level
        cut_dimensions.append(parent.q)
        node = parent
    # np.argmax returns the first maximum, so ties resolve to the ancestor
    # closest to the leaf
    argmax = np.argmax(results)

    return results[argmax], cut_dimensions[argmax]

def get_bbox(self, branch=None):
"""
Compute bounding box of all points underneath a given branch.
Expand Down
7 changes: 7 additions & 0 deletions test/test_rrcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,13 @@ def test_codisp():
codisp = tree.codisp(i)
assert codisp > 0

def test_codisp_with_cut_dimension():
    """Check CoDisp is positive and the cut dimension is a valid feature index."""
    for leaf_index in range(100):
        score, dimension = tree.codisp_with_cut_dimension(leaf_index)
        # Every indexed leaf should have a strictly positive displacement
        assert score > 0
        # The reported cut dimension must lie in [0, d)
        assert 0 <= dimension < d


def test_disp():
for i in range(100):
disp = tree.disp(i)
Expand Down

0 comments on commit 8175353

Please sign in to comment.