Skip to content

Commit

Permalink
Tuesday 11:45 PM
Browse files Browse the repository at this point in the history
  • Loading branch information
Tabaie committed Oct 14, 2015
1 parent d2cee8d commit 4fec0a8
Show file tree
Hide file tree
Showing 6 changed files with 3,219 additions and 1,490 deletions.
184 changes: 147 additions & 37 deletions KR_A1.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,6 @@ def __str__(self):

#MAIN ROUTINE

random.seed()

#READ DATA FROM FILE
f= open("cmc.data", "rt")
Expand All @@ -106,8 +105,8 @@ def __str__(self):
i= i+1


if i==5:
break
# if i==100:
# break

f.close()

Expand All @@ -116,14 +115,15 @@ def __str__(self):
Record.categoricalCount= len(data[0].categoricals)

#CREATE test and training data sets
#Shuffle the records, hold out 1/k of them for testing, train on the rest.
k= 10
random.seed()
random.shuffle(data)
test= np.array(data[:len(data)/k])      #Py2 integer division: first 1/k records
train= np.array(data[len(data)/k :])

data = np.array(data)  #full data set as an array; used for the whole-set accuracy check at the end

#COMPUTING ENTROPY OF A DATA SET
def Entropy(data):
Expand Down Expand Up @@ -153,69 +153,167 @@ class Node:
binary=1
numerical=2

# dividingAttrT #Categorical, Boolean or Number
# dividingAttrI #Index among its own type
AttrTypeName=["categorical", "binary", "numerical"]

# parent
nodeNum=0

# data

# children
def __init__(self, level, data):
    """Create a tree node at depth `level` holding the given records."""
    self.level= level
    self.data= data
    self.children= []
    # Hand out sequential node ids from the class-level counter.
    self.nodeNum= Node.nodeNum
    Node.nodeNum += 1
    self.SetIsLeaf()

def SetIsLeaf(self):
    """Mark this node as a leaf when branching should stop.

    Stopping criteria: depth 10 reached, node holds exactly 4 records
    (NOTE(review): looks like this was meant to be <=4 - confirm), or the
    node's entropy is already below 1.2. Short-circuit order preserved so
    Entropy is only computed when the cheaper checks fail.
    """
    self.isLeaf = True
    if self.level != 10 and len(self.data) != 4 and Entropy(self.data) >= 1.2:
        self.isLeaf = False

def SetFinalLabel(self): #Leaf node, take the label that has majority
    """Set self.finalLabel to the most frequent target value (1 when empty)."""
    if len(self.data)==0:
        # No records reached this leaf; fall back to label 1.
        self.finalLabel= 1
        return
    labels= [record.tgt.val for record in self.data]
    freq= np.array([labels.count(v) for v in Contraception.vals])
    # argmax yields an index into Contraception.vals; shift back to a label value.
    self.finalLabel= np.argmax(freq) + Contraception.No

def CreateTree(self):
    """Recursively grow the subtree rooted at this node, then log it."""
    if not self.isLeaf:
        # Interior node: pick the best split and build the children.
        self.SetBestSplitAttr()
        self.CreateSubTrees()
    else:
        self.SetFinalLabel()
    self.LogCreation()


def LogData(self):
for r in self.data:
print r

def SplitEntropy(self, dividingAttrT, dividingAttrI):
def LogCreation(self):
print "Node",self.nodeNum

# self.LogData()

if self.isLeaf:
print "Node leaf with data entropy=",Entropy(self.data),", no more branching"
print "Always predict", self.finalLabel
else:

print "dividing by", Node.AttrTypeName[self.dividingAttrT], "no", self.dividingAttrI
if self.dividingAttrT== Node.numerical:
print "threshold=", self.dividingThresh

print "\tChildren",
for ch in self.children:
print "Node", ch.nodeNum,

print ""

if (dividingAttrT== Node.numerical):
vals= np.array([r.numericals[dividingAttrI] for r in self.data])
return 200000000000000000000000

def Classify(self,r):
    """Walk the tree from this node and return the predicted label for record r."""
    if self.isLeaf:
        return self.finalLabel
    # Choose the child branch index according to the splitting attribute type.
    if self.dividingAttrT== Node.binary:
        branch= 1 if r.binaries[self.dividingAttrI] else 0
    elif self.dividingAttrT== Node.numerical:
        # child 0 holds values below the threshold, child 1 the rest
        branch= 0 if r.numericals[self.dividingAttrI]< self.dividingThresh else 1
    elif self.dividingAttrT== Node.categorical:
        # children are ordered by CatAttr value, offset by the lowest value
        branch= r.categoricals[self.dividingAttrI].val - CatAttr.low
    return self.children[branch].Classify(r)

def CreateSubTrees(self):
    """Partition this node's data along the chosen split and grow one child per part."""
    if (self.dividingAttrT== Node.numerical):
        dataDivisions= self.DivideNumerical(self.dividingAttrI, self.dividingThresh)
    else:
        dataDivisions= self.DivideNonNumerical(self.dividingAttrT, self.dividingAttrI)

    # One child per partition, one level deeper; then recurse into each.
    self.children= [ Node(self.level+1, d) for d in dataDivisions]

    for ch in self.children:
        ch.CreateTree()


def DivideNonNumerical(self, dividingAttrT, dividingAttrI):
    """Split self.data into one partition per possible value of a binary or
    categorical attribute.

    Returns a list of arrays ordered by possible value ([False, True] for
    binary, CatAttr.vals order for categorical) - Classify relies on this
    ordering when it picks a child index.
    """
    if (dividingAttrT== Node.binary):
        vals= np.array([r.binaries[dividingAttrI] for r in self.data])
        possibleVals= [False,True]
    elif (dividingAttrT== Node.categorical):
        vals= np.array([r.categoricals[dividingAttrI].val for r in self.data])
        possibleVals= CatAttr.vals

    valI=[ np.where(vals==val) for val in possibleVals]
    return [ self.data[i[0] ] for i in valI]


def DivideNumerical(self, dividingAttrI, thresh):
    """Two-way split of self.data: records below thresh, then records at or above it."""
    attrVals= np.array([rec.numericals[dividingAttrI] for rec in self.data])
    below= np.where( (attrVals>=thresh) == False)
    atOrAbove= np.where( (attrVals>=thresh) == True)
    return [ self.data[below[0] ], self.data[atOrAbove[0] ] ]

def SplitEntropyNonNumerical(self, dividingAttrT, dividingAttrI):
    """Entropy of the split induced by a binary/categorical attribute."""
    partitions= self.DivideNonNumerical(dividingAttrT, dividingAttrI)
    return self.SplitEntropy(partitions)

def SplitEntropy(self,dats):
    """Weighted average entropy of a partition of this node's data."""
    weight= 1.0/ len(self.data)
    acc= 0.0
    for part in dats:
        acc += weight * Entropy(part) * len(part)
    return acc


def SplitEntropyNumerical(self, dividingAttrI, thresh):
    """Entropy of the two-way numerical split at the given threshold."""
    partitions= self.DivideNumerical(dividingAttrI, thresh)
    return self.SplitEntropy(partitions)

def SetBestSplitAttr(self):
    """Choose the split with the lowest split entropy across all attributes.

    Sets self.dividingAttrT (categorical/binary/numerical), self.dividingAttrI
    (index within that type) and, for numerical splits, self.dividingThresh
    and self.dividingTgt.
    """

    #CATEGORICAL SPLIT... JUST REMEMBER Node.categorical=0
    split= np.array([ self.SplitEntropyNonNumerical(Node.categorical, i) for i in xrange(Record.categoricalCount)])
    bestI= np.array([np.argmin(split), 0, 0])
    best= np.array( [split[bestI[0] ], 0, 22220])  # slots 1 and 2 are placeholders, overwritten below

    #BINARY SPLIT
    split= np.array([ self.SplitEntropyNonNumerical(Node.binary, i) for i in xrange(Record.binaryCount)])
    bestI[Node.binary]= np.argmin(split)
    best[Node.binary]= split[bestI[Node.binary]]

    #NUMERICAL SPLIT
    # For each numerical attribute, try one regression-derived threshold per target class.
    regressionThreshes= [ [ self.RegressionThresh(i, t) for t in Contraception.vals ] for i in xrange(Record.numericalCount)]

    split= np.array([ np.array([ self.SplitEntropyNumerical(i,regressionThreshes[i][t- Contraception.No]) for t in Contraception.vals ]) for i in xrange(Record.numericalCount)])

    BestTgtIforCatI= np.array( [ np.argmin(split[i]) for i in xrange(Record.numericalCount)])
    BestforCatI= np.array( [split[i][BestTgtIforCatI[i]] for i in xrange(Record.numericalCount)] )

    bestI[Node.numerical]=np.argmin(BestforCatI)
    best[Node.numerical]= BestforCatI[bestI[Node.numerical]]
    self.dividingThresh= regressionThreshes[bestI[Node.numerical]][BestTgtIforCatI[bestI[Node.numerical]]]
    self.dividingTgt= BestTgtIforCatI[bestI[Node.numerical]]

    # Overall winner across the three attribute types.
    bestofbest= np.argmin(best)
    self.dividingAttrI= bestI[bestofbest]
    self.dividingAttrT= bestofbest




def RegressionThresh(self, dividingAttrI, tgtToIsolate):

vals= [ (r.numericals[dividingAttrI], 1 if r.tgt.val==tgtToIsolate else -1) for r in self.data]

xs= [val[0] for val in vals]

xySum= sum([val[0]*val[1] for val in vals])

wInv= float(sum([x*x for x in xs])) / sum([val[0]*val[1] for val in vals])
if xySum==.0:
wInv=999999 #A very large number,just to avoid division by zero
else:
wInv= float(sum([x*x for x in xs])) / xySum

nInv= 1.0/len(self.data)

Expand All @@ -227,9 +325,21 @@ def RegressionThresh(self, dividingAttrI, tgtToIsolate):
xs.sort()
return xs[len(xs)/2] #the median

root= Node()
root.data= data
print root.RegressionThresh(0,Contraception.No)
#root.SetBestSplitAttr()
#print root.SplitEntropy(Node.categorical, 0)
root= Node(0,train)
root.CreateTree()

correct=0
for r in test:
prediction=root.Classify(r)
print "Prediction", prediction, "Class", r.tgt.val
if (prediction== r.tgt.val):
correct= correct +1

print float(correct)/len(test)

correct2=0
for r in data:
if root.Classify(r)== r.tgt.val:
correct2+=1

print float(correct2)/len(data)
Loading

0 comments on commit 4fec0a8

Please sign in to comment.