-
Notifications
You must be signed in to change notification settings - Fork 21
/
decision_tree_id3.py
79 lines (63 loc) · 2.58 KB
/
decision_tree_id3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import numpy as np
import treelib
import scipy.stats
class ID3():
    '''ID3 decision tree classifier for discrete (categorical) features.

    The tree is built by recursively splitting on the feature with the
    highest information gain (entropy reduction). Nodes are stored in a
    ``treelib.Tree``: an internal node's ``data`` holds the index of the
    feature it splits on, a leaf node's ``data`` holds the predicted class
    label, and each child's ``tag`` holds the feature value leading to it.
    '''
    def __init__(self):
        # Backing tree structure; populated by fit().
        self.__tree = treelib.Tree()

    def __get_entropy(self, y):
        '''Shannon entropy (in nats) of the class distribution in y.'''
        _, counts = np.unique(y, return_counts=True)
        # scipy normalizes, but passing probabilities keeps intent explicit.
        return scipy.stats.entropy(counts / np.sum(counts))

    def __get_info_gain(self, y_subs, y):
        '''Information gain of a split: H(y) minus the size-weighted mean
        entropy of the child subsets y_subs.'''
        weighted = sum(self.__get_entropy(y_sub) * len(y_sub) for y_sub in y_subs)
        return self.__get_entropy(y) - weighted / len(y)

    def __create_tree(self, parent, X, y):
        '''Recursively grow the subtree rooted at ``parent`` from (X, y).'''
        n_samples, n_features = X.shape
        if n_samples == 0:
            # Empty partition: leave the node's data unset.
            return
        # Stop when the node is pure or no feature can separate the samples;
        # label the leaf with the majority class. np.unique + argmax is a
        # single deterministic pass (the original max(set(y), key=count)
        # rescanned y once per distinct class and depended on set order).
        if len(np.unique(y)) == 1 or (X == X[0]).all():
            classes, counts = np.unique(y, return_counts=True)
            self.__tree.update_node(parent.identifier, data=classes[np.argmax(counts)])
            return
        # Pick the feature with maximal information gain. The purity check
        # above guarantees at least one feature is non-constant here, so
        # feature_split is always assigned.
        info_gain_max = -np.inf
        for i in range(n_features):
            labels = np.unique(X[:, i])  # hoisted: computed once per feature
            if len(labels) == 1:
                continue  # constant feature cannot split this node
            y_subs = [y[X[:, i] == label] for label in labels]
            info_gain = self.__get_info_gain(y_subs, y)
            if info_gain > info_gain_max:
                info_gain_max = info_gain
                feature_split = i
        self.__tree.update_node(parent.identifier, data=feature_split)
        # One child per observed value of the chosen feature; the membership
        # mask is computed once and reused for both X and y (the original
        # evaluated np.flatnonzero twice per child).
        for label in np.unique(X[:, feature_split]):
            mask = X[:, feature_split] == label
            node = self.__tree.create_node(label, parent=parent)
            self.__create_tree(node, X[mask], y[mask])

    def fit(self, X, y):
        '''
        Parameters
        ----------
        X : shape (n_samples, n_features)
            Training data, must be discrete value
        y : shape (n_samples,)
            Target values
        '''
        root = self.__tree.create_node('root')
        self.__create_tree(root, X, y)
        # NOTE: prints an ASCII rendering of the learned tree to stdout;
        # kept because callers may rely on this diagnostic side effect.
        self.__tree.show()

    def __query(self, x, node):
        '''Walk the tree from ``node`` following x's feature values; return
        the leaf's class label. Implicitly returns None when x carries a
        feature value never seen at this node during training.'''
        if node.is_leaf():
            return node.data
        feature_split = node.data
        for child in self.__tree.children(node.identifier):
            if x[feature_split] == child.tag:
                return self.__query(x, child)

    def predict(self, X):
        '''
        Parameters
        ----------
        X : shape (n_samples, n_features)
            Predicting data, must be discrete value
        Returns
        -------
        y : shape (n_samples,)
            Predicted class label per sample
        '''
        return np.apply_along_axis(self.__query, 1, X, self.__tree.get_node(self.__tree.root))