#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 15 10:53:46 2017
@author: oem
"""
import requests
import json
import time
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
import math
# Reddit throttles the default python-requests User-Agent, so send a
# descriptive custom one (per Reddit API guidelines).
hdr = {'User-Agent': 'ubuntu:Python/politics.single.result:v1.0' +
       '(by /u/Jeevithan)'}
url = 'https://www.reddit.com/r/politics/.json'
# timeout= keeps the script from hanging forever on a stalled connection
req = requests.get(url, headers=hdr, timeout=10)
req.raise_for_status()  # fail fast on HTTP errors (e.g. 429 rate limiting)
json_data = req.json()
data_all = json_data['data']['children']

# Page through the listing (Reddit returns ~25 posts per request) until we
# have collected more than 1000 posts or the listing stops growing.
num_of_posts = 0
while len(data_all) <= 1000:
    time.sleep(2)  # be polite to the API between requests
    # 'name' is the post's fullname (e.g. t3_xxxxx), used as the paging cursor
    last = data_all[-1]['data']['name']
    page_url = url + '?after=' + str(last)
    req = requests.get(page_url, headers=hdr, timeout=10)
    req.raise_for_status()
    data = req.json()
    data_all += data['data']['children']
    if num_of_posts == len(data_all):
        break  # no new posts returned -> reached the end of the listing
    num_of_posts = len(data_all)
# Score each post title with VADER; the 'compound' score in [-1, 1]
# summarizes overall polarity of the headline.
sia = SIA()
pos_list = []
neg_list = []
for post in data_all:
    print(post)
    title = post['data']['title']
    scores = sia.polarity_scores(title)
    print(scores)
    # thresholds of +/-0.2 discard near-neutral headlines
    if scores['compound'] > 0.2:
        pos_list.append(title)
    elif scores['compound'] < -0.2:
        neg_list.append(title)
# Persist the classified titles, one per line, for the frequency
# analysis further down the script.
with open("pos_news_titles.txt", "w", encoding='utf-8',
          errors='ignore') as f_pos:
    f_pos.writelines(title + "\n" for title in pos_list)
with open("neg_news_titles.txt", "w", encoding='utf-8',
          errors='ignore') as f_neg:
    f_neg.writelines(title + "\n" for title in neg_list)
# Demonstrate two tokenizers: NLTK's default word_tokenize keeps
# punctuation as separate tokens, while a \w+ regex tokenizer keeps
# only runs of word characters.
example = "This is an example sentence! However, it " \
          "is a very informative one,"
print(nltk.word_tokenize(example, language='english'))
tokenizer = RegexpTokenizer(r'\w+')  # one token per run of [A-Za-z0-9_]
print(tokenizer.tokenize(example))
# Frequency analysis of the positive titles: tokenize each line, drop
# English stopwords, lowercase, and report the 30 most common words.
stop_words = set(stopwords.words('english'))
all_words_pos = []
with open("pos_news_titles.txt", "r", encoding='utf-8',
          errors='ignore') as f_pos:
    for line in f_pos:  # iterate lazily; no need to slurp via readlines()
        for w in tokenizer.tokenize(line):
            word = w.lower()  # lowercase once, reused for test and append
            if word not in stop_words:
                all_words_pos.append(word)
pos_res = nltk.FreqDist(all_words_pos)
print(pos_res.most_common(30))
# Same frequency analysis for the negative titles (mirrors the
# positive-titles pass above).
all_words_neg = []
with open("neg_news_titles.txt", "r", encoding='utf-8',
          errors='ignore') as f_neg:
    for line in f_neg:  # iterate lazily; no need to slurp via readlines()
        for w in tokenizer.tokenize(line):
            word = w.lower()  # lowercase once, reused for test and append
            if word not in stop_words:
                all_words_neg.append(word)
neg_res = nltk.FreqDist(all_words_neg)
print(neg_res.most_common(30))