-
Notifications
You must be signed in to change notification settings - Fork 12
/
03.ID3.py
129 lines (97 loc) · 3.96 KB
/
03.ID3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
3. Write a program to demonstrate the working of the decision tree based ID3
algorithm. Use an appropriate data set for building the decision tree and apply this
knowledge to classify a new sample.
"""
import math
import csv
from collections import Counter
with open('ds2.csv') as csvFile:
g_data = [tuple(line) for line in csv.reader(csvFile)]
g_headers = g_data[0]
g_data = g_data[1:]
class Node:
    """A single decision-tree node; leaves carry a `decision` label."""

    def __init__(self, headers, data):
        self.headers = headers            # attribute names still in play at this node
        self.data = data                  # training rows that reached this node
        self.decision_attribute = None    # attribute this node splits on (set by id3)
        self.decision = None              # class label at a leaf; None for internal nodes
        self.child = {}                   # attribute value -> child Node
def get_attribute_column(headers, data, attribute):
    """Return the list of values that `attribute` takes, one per row of `data`."""
    column = headers.index(attribute)
    return [row[column] for row in data]
def calculate_entropy(probs):
    """Return the Shannon entropy (in bits) of the distribution `probs`.

    Every probability must be strictly positive — math.log is undefined at 0.
    An empty sequence yields 0.
    """
    # Generator expression: no need to build an intermediate list for sum().
    return sum(-prob * math.log(prob, 2) for prob in probs)
def split_data(headers, data, attribute, attr_value):
    """Return the rows of `data` whose `attribute` column equals `attr_value`."""
    column = headers.index(attribute)
    matches = []
    for row in data:
        if row[column] == attr_value:
            matches.append(row)
    return matches
def entropy(headers, data, attribute='PlayTennis', gain=False):
    """Return the entropy of `attribute` over `data`.

    With gain=True, instead return the negated expected entropy of the
    target after splitting `data` on `attribute`:
        -sum_v p(v) * H(PlayTennis | attribute = v)
    (already negated, so callers simply add it to the base entropy).
    """
    # Counter gives the frequency of each distinct value of the attribute.
    cnt = Counter(get_attribute_column(headers, data, attribute))
    num_instances = len(data)
    probs = [count / num_instances for count in cnt.values()]
    if not gain:
        return calculate_entropy(probs)
    # NOTE: previously the boolean `gain` parameter itself was reused as the
    # numeric accumulator; use a separate local to keep flag and sum distinct.
    weighted = 0
    for attr_value, prob in zip(cnt.keys(), probs):
        weighted += -prob * entropy(headers, split_data(headers, data, attribute, attr_value))
    return weighted
def information_gain(headers, data):
    """Return the non-target attribute with the highest information gain.

    Gain(A) = H(PlayTennis) + entropy(..., A, gain=True), where the second
    term is the (already negated) expected entropy after splitting on A.
    """
    # The base entropy of the target is loop-invariant: compute it once
    # instead of once per candidate attribute.
    base_entropy = entropy(headers, data)
    max_gain = -1
    max_gain_attribute = None
    for attribute in headers:
        if attribute == 'PlayTennis':  # never split on the target itself
            continue
        gain = base_entropy + entropy(headers, data, attribute, gain=True)
        if gain > max_gain:
            max_gain = gain
            max_gain_attribute = attribute
    return max_gain_attribute
def drop_attribute(headers, data, attribute):
    """Return (new_headers, new_data) with `attribute`'s column removed.

    Rows stay tuples; `headers` and `data` are not modified.
    """
    i = headers.index(attribute)
    new_headers = headers[:i] + headers[i + 1:]
    # NOTE: previously the loop variable shadowed the `data` parameter
    # (`for data in data`); use a distinct name for clarity.
    new_dataset = [row[:i] + row[i + 1:] for row in data]
    return new_headers, new_dataset
def most_common_outcome(headers, data):
    """Return the majority value of the PlayTennis (target) column."""
    outcomes = get_attribute_column(headers, data, 'PlayTennis')
    return Counter(outcomes).most_common(1)[0][0]
def id3(root):
    """Grow the decision tree in place below `root` using the ID3 algorithm."""
    # Only the target column remains: fall back to the majority outcome.
    if len(root.headers) == 1:
        root.decision = most_common_outcome(root.headers, root.data)
        return
    outcomes = set(get_attribute_column(root.headers, root.data, 'PlayTennis'))
    # Pure node: every row agrees on the outcome, so this is a leaf.
    if len(outcomes) == 1:
        root.decision = next(iter(outcomes))
        return
    best = information_gain(root.headers, root.data)
    root.decision_attribute = best
    for value in set(get_attribute_column(root.headers, root.data, best)):
        subset = split_data(root.headers, root.data, best, value)
        if subset is None or not subset:
            root.decision = most_common_outcome(root.headers, root.data)
            return
        child_headers, child_rows = drop_attribute(root.headers, subset, best)
        child = Node(child_headers, child_rows)
        root.child[value] = child
        id3(child)
# Build the decision tree from the full training set loaded above.
root = Node(g_headers, g_data)
id3(root)
def print_tree(root, disp=""):
    """Print one IF ... AND ... THEN rule per leaf of the tree."""
    if root.decision is not None:
        # Leaf: emit the accumulated conditions (minus the trailing "AND ")
        # followed by the conclusion; a bare root prints just the decision.
        if disp:
            print(disp[:-4] + "THEN " + str(root.decision))
        else:
            print(str(root.decision))
        return
    for value, subtree in root.child.items():
        print_tree(subtree, disp + "IF {} EQUALS {} AND ".format(root.decision_attribute, value))
# Dump the learned tree as human-readable classification rules.
print("Decision Tree Rules:")
print_tree(root)
"""
Output:
Decision Tree Rules:
IF Outlook EQUALS Rainy AND IF Windy EQUALS True THEN No
IF Outlook EQUALS Rainy AND IF Windy EQUALS False THEN Yes
IF Outlook EQUALS Overcast THEN Yes
IF Outlook EQUALS Sunny AND IF Humidity EQUALS High THEN No
IF Outlook EQUALS Sunny AND IF Humidity EQUALS Normal THEN Yes
"""