sentiment_analysis.py
import re
import math
import itertools
import os

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
ROOT_POLARITY_DATA_DIR = os.path.join('polarityData','rt-polaritydata')
POLARITY_DATA_DIR = os.path.join('polarityData', 'rt-polaritydata', 'review_polarity','txt_sentoken')
POS_DIRECTORY = os.path.join(POLARITY_DATA_DIR,'pos')
NEG_DIRECTORY = os.path.join(POLARITY_DATA_DIR,'neg')
RT_POLARITY_POS_FILE = os.path.join(ROOT_POLARITY_DATA_DIR, 'rt-polarity-pos.txt')
RT_POLARITY_NEG_FILE = os.path.join(ROOT_POLARITY_DATA_DIR, 'rt-polarity-neg.txt')
USER_DEFINED_NEG_FILE = os.path.join(ROOT_POLARITY_DATA_DIR,'user_defined_neg.txt')
#this function takes a feature selection mechanism, trains a classifier on 3/4 of the data, and reports its accuracy on the rest
def evaluate_features(feature_select):
    posFeatures = []
    negFeatures = []
    #http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    #breaks each sentence into a list of individual words (as selected by the input mechanism) and pairs it with its 'pos' or 'neg' label
    for pos_file in os.listdir(POS_DIRECTORY):
        fileName = os.path.join(POS_DIRECTORY, pos_file)
        with open(fileName, 'r') as posSentences:
            for i in posSentences:
                posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
                posFeatures.append((feature_select(posWords), 'pos'))
    for neg_file in os.listdir(NEG_DIRECTORY):
        fileNameNeg = os.path.join(NEG_DIRECTORY, neg_file)
        with open(fileNameNeg, 'r') as negSentences:
            for i in negSentences:
                negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
                negFeatures.append((feature_select(negWords), 'neg'))
    #selects 3/4 of the features to be used for training and 1/4 to be used for testing
    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
    #trains a Naive Bayes Classifier, kept in a module-level global so get_sentiment can reuse it
    global classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)
    #reports accuracy on the held-out quarter of the data
    print('accuracy: %f' % nltk.classify.util.accuracy(classifier, testFeatures))
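#shape note (illustrative example, not from the corpus): each element of trainFeatures is a
#(featureset, label) pair such as ({'great': True, '!': True}, 'pos'), which is the
#input format NaiveBayesClassifier.train expects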
#scores words based on a chi-squared test to show information gain (http://streamhacker.com/2010/06/16/text-classification-sentiment-analysis-eliminate-low-information-features/)
def create_word_scores():
    #creates lists of all positive and negative words
    posWords = []
    negWords = []
    for pos_file in os.listdir(POS_DIRECTORY):
        fileName = os.path.join(POS_DIRECTORY, pos_file)
        with open(fileName, 'r') as posSentences:
            for i in posSentences:
                posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
                posWords.append(posWord)
    for neg_file in os.listdir(NEG_DIRECTORY):
        fileNameNeg = os.path.join(NEG_DIRECTORY, neg_file)
        with open(fileNameNeg, 'r') as negSentences:
            for i in negSentences:
                negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
                negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))
    #builds a frequency distribution of all words, plus conditional frequency distributions of words within the positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1
    #finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    #builds a dictionary of word scores based on the chi-squared test
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
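#illustration with made-up counts: for a word seen 10 times overall, 9 of them in
#positive reviews, with 50000 positive tokens out of 100000 total, its positive score is
#  BigramAssocMeasures.chi_sq(9, (10, 50000), 100000)
#a high combined pos+neg score means the word is strongly associated with one label,
#i.e. it carries more information for classification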
#finds the best 'number' words based on word scores
def find_best_words(word_scores, number):
    best_vals = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    best_words = set(w for w, s in best_vals)
    return best_words
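#example with hypothetical scores:
#  find_best_words({'great': 9.2, 'the': 0.1, 'awful': 8.7}, 2) -> {'awful', 'great'}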
#creates a feature selection mechanism that only uses the best words
def best_word_features(words):
    #relies on the module-level global best_words set by initiate_classifier
    return {word: True for word in words if word in best_words}
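#example (with a hypothetical best_words of {'great', 'awful'}):
#  best_word_features(['a', 'great', 'movie']) -> {'great': True}
#words outside best_words are simply dropped, which is what eliminates low-information features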
#classifies a single piece of text with the trained classifier
def get_sentiment(tweet):
    tweetWords = re.findall(r"[\w']+|[.,!?;]", tweet.rstrip())
    tweetFeatures = best_word_features(tweetWords)
    return classifier.classify(tweetFeatures)
def initiate_classifier():
    #finds word scores
    word_scores = create_word_scores()
    #numbers of features to select
    numbers_to_test = [750]
    #keeps the selected words in a module-level global so best_word_features can see them
    global best_words
    #tries the best_word_features mechanism with each of the numbers_to_test of features
    for num in numbers_to_test:
        print('evaluating best %d word features' % num)
        best_words = find_best_words(word_scores, num)
        evaluate_features(best_word_features)
initiate_classifier()
predicted = get_sentiment("Worst customer service i experienced ")
print(predicted)
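#further usage example (hypothetical input; the label it prints depends on the trained model)
print(get_sentiment("Fantastic support, resolved my issue in minutes"))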