Skip to content

Commit

Permalink
Tuesday 11:45 PM
Browse files Browse the repository at this point in the history
  • Loading branch information
Tabaie committed Oct 14, 2015
1 parent d2cee8d commit 4fec0a8
Show file tree
Hide file tree
Showing 6 changed files with 3,219 additions and 1,490 deletions.
184 changes: 147 additions & 37 deletions KR_A1.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,6 @@ def __str__(self):

#MAIN ROUTINE

random.seed()

#READ DATA FROM FILE
f= open("cmc.data", "rt")
Expand All @@ -106,8 +105,8 @@ def __str__(self):
i= i+1


if i==5:
break
# if i==100:
# break

f.close()

Expand All @@ -116,14 +115,15 @@ def __str__(self):
Record.categoricalCount= len(data[0].categoricals)

#CREATE test and training data sets
#Shuffle the records, hold out 1/k of them for testing, train on the rest.
k= 10
random.seed()
random.shuffle(data)
test= np.array(data[:len(data)/k])      #Py2 integer division: first 1/k records
train= np.array(data[len(data)/k :])

data = np.array(data)  #full data set as an array; used for the whole-set accuracy check at the end

#COMPUTING ENTROPY OF A DATA SET
def Entropy(data):
Expand Down Expand Up @@ -153,69 +153,167 @@ class Node:
binary=1
numerical=2

# dividingAttrT #Categorical, Boolean or Number
# dividingAttrI #Index among its own type
AttrTypeName=["categorical", "binary", "numerical"]

# parent
nodeNum=0

# data

# children
def __init__(self, level, data):
    """Create a tree node at depth `level` holding the given records."""
    self.level= level
    self.data= data
    self.children= []
    # Hand out sequential node ids from the class-level counter.
    self.nodeNum= Node.nodeNum
    Node.nodeNum += 1
    self.SetIsLeaf()

def SetIsLeaf(self):
    """Mark this node as a leaf when branching should stop.

    Stopping criteria: depth 10 reached, node holds exactly 4 records
    (NOTE(review): looks like this was meant to be <=4 - confirm), or the
    node's entropy is already below 1.2. Short-circuit order preserved so
    Entropy is only computed when the cheaper checks fail.
    """
    self.isLeaf = True
    if self.level != 10 and len(self.data) != 4 and Entropy(self.data) >= 1.2:
        self.isLeaf = False

def SetFinalLabel(self): #Leaf node, take the label that has majority
    """Set self.finalLabel to the most frequent target value (1 when empty)."""
    if len(self.data)==0:
        # No records reached this leaf; fall back to label 1.
        self.finalLabel= 1
        return
    labels= [record.tgt.val for record in self.data]
    freq= np.array([labels.count(v) for v in Contraception.vals])
    # argmax yields an index into Contraception.vals; shift back to a label value.
    self.finalLabel= np.argmax(freq) + Contraception.No

def CreateTree(self):
    """Recursively grow the subtree rooted at this node, then log it."""
    if not self.isLeaf:
        # Interior node: pick the best split and build the children.
        self.SetBestSplitAttr()
        self.CreateSubTrees()
    else:
        self.SetFinalLabel()
    self.LogCreation()


def LogData(self):
for r in self.data:
print r

def SplitEntropy(self, dividingAttrT, dividingAttrI):
def LogCreation(self):
print "Node",self.nodeNum

# self.LogData()

if self.isLeaf:
print "Node leaf with data entropy=",Entropy(self.data),", no more branching"
print "Always predict", self.finalLabel
else:

print "dividing by", Node.AttrTypeName[self.dividingAttrT], "no", self.dividingAttrI
if self.dividingAttrT== Node.numerical:
print "threshold=", self.dividingThresh

print "\tChildren",
for ch in self.children:
print "Node", ch.nodeNum,

print ""

if (dividingAttrT== Node.numerical):
vals= np.array([r.numericals[dividingAttrI] for r in self.data])
return 200000000000000000000000

def Classify(self,r):
    """Walk the tree from this node and return the predicted label for record r."""
    if self.isLeaf:
        return self.finalLabel
    # Choose the child branch index according to the splitting attribute type.
    if self.dividingAttrT== Node.binary:
        branch= 1 if r.binaries[self.dividingAttrI] else 0
    elif self.dividingAttrT== Node.numerical:
        # child 0 holds values below the threshold, child 1 the rest
        branch= 0 if r.numericals[self.dividingAttrI]< self.dividingThresh else 1
    elif self.dividingAttrT== Node.categorical:
        # children are ordered by CatAttr value, offset by the lowest value
        branch= r.categoricals[self.dividingAttrI].val - CatAttr.low
    return self.children[branch].Classify(r)

def CreateSubTrees(self):
    """Partition this node's data along the chosen split and grow one child per part."""
    if (self.dividingAttrT== Node.numerical):
        dataDivisions= self.DivideNumerical(self.dividingAttrI, self.dividingThresh)
    else:
        dataDivisions= self.DivideNonNumerical(self.dividingAttrT, self.dividingAttrI)

    # One child per partition, one level deeper; then recurse into each.
    self.children= [ Node(self.level+1, d) for d in dataDivisions]

    for ch in self.children:
        ch.CreateTree()


def DivideNonNumerical(self, dividingAttrT, dividingAttrI):
    """Split self.data into one partition per possible value of a binary or
    categorical attribute.

    Returns a list of arrays ordered by possible value ([False, True] for
    binary, CatAttr.vals order for categorical) - Classify relies on this
    ordering when it picks a child index.
    """
    if (dividingAttrT== Node.binary):
        vals= np.array([r.binaries[dividingAttrI] for r in self.data])
        possibleVals= [False,True]
    elif (dividingAttrT== Node.categorical):
        vals= np.array([r.categoricals[dividingAttrI].val for r in self.data])
        possibleVals= CatAttr.vals

    valI=[ np.where(vals==val) for val in possibleVals]
    return [ self.data[i[0] ] for i in valI]


def DivideNumerical(self, dividingAttrI, thresh):
    """Two-way split of self.data: records below thresh, then records at or above it."""
    attrVals= np.array([rec.numericals[dividingAttrI] for rec in self.data])
    below= np.where( (attrVals>=thresh) == False)
    atOrAbove= np.where( (attrVals>=thresh) == True)
    return [ self.data[below[0] ], self.data[atOrAbove[0] ] ]

def SplitEntropyNonNumerical(self, dividingAttrT, dividingAttrI):
    """Entropy of the split induced by a binary/categorical attribute."""
    partitions= self.DivideNonNumerical(dividingAttrT, dividingAttrI)
    return self.SplitEntropy(partitions)

def SplitEntropy(self,dats):
    """Weighted average entropy of a partition of this node's data."""
    weight= 1.0/ len(self.data)
    acc= 0.0
    for part in dats:
        acc += weight * Entropy(part) * len(part)
    return acc


def SplitEntropyNumerical(self, dividingAttrI, thresh):
    """Entropy of the two-way numerical split at the given threshold."""
    partitions= self.DivideNumerical(dividingAttrI, thresh)
    return self.SplitEntropy(partitions)

def SetBestSplitAttr(self):
    """Choose the split with the lowest split entropy across all attributes.

    Sets self.dividingAttrT (categorical/binary/numerical), self.dividingAttrI
    (index within that type) and, for numerical splits, self.dividingThresh
    and self.dividingTgt.
    """

    #CATEGORICAL SPLIT... JUST REMEMBER Node.categorical=0
    split= np.array([ self.SplitEntropyNonNumerical(Node.categorical, i) for i in xrange(Record.categoricalCount)])
    bestI= np.array([np.argmin(split), 0, 0])
    best= np.array( [split[bestI[0] ], 0, 22220])  # slots 1 and 2 are placeholders, overwritten below

    #BINARY SPLIT
    split= np.array([ self.SplitEntropyNonNumerical(Node.binary, i) for i in xrange(Record.binaryCount)])
    bestI[Node.binary]= np.argmin(split)
    best[Node.binary]= split[bestI[Node.binary]]

    #NUMERICAL SPLIT
    # For each numerical attribute, try one regression-derived threshold per target class.
    regressionThreshes= [ [ self.RegressionThresh(i, t) for t in Contraception.vals ] for i in xrange(Record.numericalCount)]

    split= np.array([ np.array([ self.SplitEntropyNumerical(i,regressionThreshes[i][t- Contraception.No]) for t in Contraception.vals ]) for i in xrange(Record.numericalCount)])

    BestTgtIforCatI= np.array( [ np.argmin(split[i]) for i in xrange(Record.numericalCount)])
    BestforCatI= np.array( [split[i][BestTgtIforCatI[i]] for i in xrange(Record.numericalCount)] )

    bestI[Node.numerical]=np.argmin(BestforCatI)
    best[Node.numerical]= BestforCatI[bestI[Node.numerical]]
    self.dividingThresh= regressionThreshes[bestI[Node.numerical]][BestTgtIforCatI[bestI[Node.numerical]]]
    self.dividingTgt= BestTgtIforCatI[bestI[Node.numerical]]

    # Overall winner across the three attribute types.
    bestofbest= np.argmin(best)
    self.dividingAttrI= bestI[bestofbest]
    self.dividingAttrT= bestofbest




def RegressionThresh(self, dividingAttrI, tgtToIsolate):

vals= [ (r.numericals[dividingAttrI], 1 if r.tgt.val==tgtToIsolate else -1) for r in self.data]

xs= [val[0] for val in vals]

xySum= sum([val[0]*val[1] for val in vals])

wInv= float(sum([x*x for x in xs])) / sum([val[0]*val[1] for val in vals])
if xySum==.0:
wInv=999999 #A very large number,just to avoid division by zero
else:
wInv= float(sum([x*x for x in xs])) / xySum

nInv= 1.0/len(self.data)

Expand All @@ -227,9 +325,21 @@ def RegressionThresh(self, dividingAttrI, tgtToIsolate):
xs.sort()
return xs[len(xs)/2] #the median

root= Node()
root.data= data
print root.RegressionThresh(0,Contraception.No)
#root.SetBestSplitAttr()
#print root.SplitEntropy(Node.categorical, 0)
root= Node(0,train)
root.CreateTree()

correct=0
for r in test:
prediction=root.Classify(r)
print "Prediction", prediction, "Class", r.tgt.val
if (prediction== r.tgt.val):
correct= correct +1

print float(correct)/len(test)

correct2=0
for r in data:
if root.Classify(r)== r.tgt.val:
correct2+=1

print float(correct2)/len(data)
Loading

0 comments on commit 4fec0a8

Please sign in to comment.