-
Notifications
You must be signed in to change notification settings - Fork 0
/
NaiveClassifier.py
127 lines (100 loc) · 2.7 KB
/
NaiveClassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import os
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
import string
import re
from collections import Counter
import json
import nltk
import pandas as pd
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag, map_tag
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
import math
from vocabcreater import vocabulary
# Tokens are kept only when they begin with two lowercase ASCII letters.
_TOKEN_PREFIX = re.compile(r'[a-z]{2}')


def createTokens(content):
    """Split *content* into a sorted list of filtered, lowercase tokens.

    The text is lowercased, tokenized with ``wordpunct_tokenize`` and sorted.
    A token is kept when it is not in the module-level ``stop`` set and it
    starts with at least two lowercase ASCII letters.

    Args:
        content: The document text, either UTF-8 encoded bytes or an
            already-decoded text string.

    Returns:
        A list of text tokens in sorted order (duplicates are preserved).
    """
    # Decode once at the boundary and only when needed: calling .decode() on
    # text that is already decoded fails, and re-encoding/decoding every
    # token afterwards is redundant work.
    if isinstance(content, bytes):
        content = content.decode('utf-8', 'ignore')
    words = sorted(wordpunct_tokenize(content.lower()))
    return [
        word for word in words
        if word not in stop and _TOKEN_PREFIX.match(word)
    ]
def get_wordnet_pos(treebank_tag):
    """Translate a part-of-speech tag into a one-letter WordNet POS code.

    The tag is matched by prefix: ``'ADJ'`` gives ``'a'``, ``'V'`` gives
    ``'v'``, ``'NO'`` gives ``'n'`` and ``'ADV'`` gives ``'r'``.

    Args:
        treebank_tag: The tag string to translate.

    Returns:
        The matching one-letter code, or ``''`` when no prefix matches.
    """
    prefix_to_pos = (
        ('ADJ', 'a'),
        ('V', 'v'),
        ('NO', 'n'),
        ('ADV', 'r'),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    return ''
def lem(tokens):
    """Lemmatize *tokens* according to their part-of-speech tags.

    Each token is tagged with ``nltk.pos_tag``, its tag is mapped from the
    'en-ptb' tagset to the 'universal' tagset, and the result is translated
    by ``get_wordnet_pos``. Tokens with a non-empty WordNet code are
    lemmatized with that code; all others are passed through unchanged.

    Args:
        tokens: A list of token strings.

    Returns:
        A list with one entry per input token, in the same order.
    """
    tagged = nltk.pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, ptb_tag in tagged:
        wordnet_pos = get_wordnet_pos(map_tag('en-ptb', 'universal', ptb_tag))
        if wordnet_pos:
            lemmas.append(lemmatizer.lemmatize(word, pos=wordnet_pos))
        else:
            lemmas.append(word)
    return lemmas
def createVocabulary(content):
    """Return the lemmatized tokens of *content*.

    The text is first tokenized by ``createTokens`` and the resulting tokens
    are then lemmatized by ``lem``.
    """
    return lem(createTokens(content))
def ClassProbablity(category):
    """Compute the relative frequency of every class label in *category*.

    Args:
        category: An iterable of hashable class labels, one per sample.

    Returns:
        A dict mapping each distinct label to its share of the samples
        (the shares add up to 1.0). An empty input gives an empty dict.
    """
    counts = Counter(category)
    # Normalize by the number of samples. Summing the dict itself would add
    # up its keys (the labels), not the per-label counts.
    total = sum(counts.values())
    probablity = {
        label: float(count) / total for label, count in counts.items()
    }
    print(probablity)
    return probablity
def CountOnVocab(content):
    """Count how often each vocabulary word occurs in *content*.

    The text is converted to lemmatized tokens with ``createVocabulary``.
    Every entry of the module-level ``vocabulary`` starts at zero and is
    incremented once per matching token. Each entry with a positive count is
    printed, followed by the complete mapping.

    Args:
        content: The document text to analyse.

    Returns:
        A dict mapping every vocabulary entry to its count in *content*.
    """
    tokens = createVocabulary(content)
    counts = dict.fromkeys(vocabulary, 0)
    for token in tokens:
        if token in vocabulary:
            counts[token] += 1
    for word, count in counts.items():
        if count > 0:
            print("%s %s" % (word, count))
    print(counts)
    return counts
# Stop words consulted by createTokens when filtering tokens.
stop = set(stopwords.words('english'))

# Load the labelled samples; the 'id', 'data' and 'class' columns are read.
d = pd.read_csv('SampleForNaive.csv')
vocablength = len(vocabulary)  # number of words in the vocabulary
Id = list(d.id.unique())
data = d['data'].tolist()
category = d['class'].tolist()
classprobablity = ClassProbablity(category)  # probability of each class

# Run the vocabulary count over the documents of each class in turn.
# Every document is paired with its own label, and each distinct class is
# visited once: comparing against a fixed index would always test the first
# label, and walking the raw label list would repeat a class once per sample.
# CountOnVocab prints its result; the returned counts are not stored here.
for item in classprobablity:
    for content, label in zip(data, category):
        if label == item:
            CountOnVocab(content)
print("%s %s" % (data, category))