#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 2 17:03:53 2017
@author: oem
"""
import re

import feedparser
import matplotlib.pyplot as plt
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.tokenize import RegexpTokenizer
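
# First-run setup (a sketch, not part of the original script): the VADER
# lexicon, the stopword list, and the Punkt tokenizer models must each be
# downloaded once before the corresponding calls below will work.
# Uncomment on a fresh NLTK install.
# nltk.download('vader_lexicon')
# nltk.download('stopwords')
# nltk.download('punkt')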

# Function to fetch the RSS feed and return the parsed result
def parseRSS(rss_url):
    return feedparser.parse(rss_url)

# Function that grabs each feed item's summary text and returns the
# summaries as a list (the 'summary' field, not the title, is what the
# rest of the script cleans and scores)
def getHeadlines(rss_url):
    headlines = []
    feed = parseRSS(rss_url)
    for newsitem in feed['items']:
        headlines.append(newsitem['summary'])
    return headlines

# A list to hold all headlines
allheadlines = []

# List of RSS feeds that we will fetch and combine. (Reuters has since
# retired feeds.reuters.com, so these URLs may no longer resolve;
# substitute any working news RSS feeds if that is the case.)
newsurls = {
    'reuterspol': 'http://feeds.reuters.com/Reuters/PoliticsNews',
    'reutersworld': 'http://feeds.reuters.com/Reuters/worldNews'
}

# Iterate over the feed urls
for key, url in newsurls.items():
    # Call getHeadlines() and combine the returned headlines with allheadlines
    allheadlines.extend(getHeadlines(url))

# Lowercase each summary and strip any embedded HTML tags
data_all = []
for hl in allheadlines:
    hl = hl.lower()
    result = re.sub('<[^>]+>', '', hl)
    data_all.append(result)

# Score each cleaned summary with VADER and bucket it by compound score
sia = SIA()
pos_list = []
neg_list = []
for post in data_all:
    print(post)
    res = sia.polarity_scores(post)
    print(res)
    if res['compound'] > 0.2:
        pos_list.append(post)
    elif res['compound'] < -0.2:
        neg_list.append(post)
with open("pos_news_titles.txt", "w", encoding='utf-8',
errors='ignore') as f_pos:
for post in pos_list:
f_pos.write(post + "\n")
with open("neg_news_titles.txt", "w", encoding='utf-8',
errors='ignore') as f_neg:
for post in neg_list:
f_neg.write(post + "\n")
example = "This is an example sentence! However, it " \
"is a very informative one,"
print(nltk.word_tokenize(example, language='english'))
print(nltk.word_tokenize(example, language='english'))
tokenizer = RegexpTokenizer(r'\w+')
print(tokenizer.tokenize(example))
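
# For illustration (expected output, abbreviated): word_tokenize keeps
# punctuation as separate tokens,
#   ['This', 'is', 'an', 'example', 'sentence', '!', 'However', ',', ...]
# while the regex tokenizer drops punctuation entirely:
#   ['This', 'is', 'an', 'example', 'sentence', 'However', 'it', ...]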

stop_words = set(stopwords.words('english'))
all_words_pos = []
with open("pos_news_titles.txt", "r", encoding='utf-8',
          errors='ignore') as f_pos:
    for line in f_pos.readlines():
        words = tokenizer.tokenize(line)
        for w in words:
            if w.lower() not in stop_words:
                all_words_pos.append(w.lower())
pos_res = nltk.FreqDist(all_words_pos)
print(pos_res.most_common(30))

all_words_neg = []
with open("neg_news_titles.txt", "r", encoding='utf-8',
          errors='ignore') as f_neg:
    for line in f_neg.readlines():
        words = tokenizer.tokenize(line)
        for w in words:
            if w.lower() not in stop_words:
                all_words_neg.append(w.lower())
neg_res = nltk.FreqDist(all_words_neg)
print(neg_res.most_common(30))
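
# A minimal plotting sketch (not in the original script) that puts the
# matplotlib/numpy imports above to use: side-by-side bar charts of the
# most frequent tokens in each class. Plotting the top 15 words is an
# assumption, not part of the original analysis.
top_n = 15
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, dist, title in [(axes[0], pos_res, 'Positive summaries'),
                        (axes[1], neg_res, 'Negative summaries')]:
    words, counts = zip(*dist.most_common(top_n))
    positions = np.arange(len(words))
    ax.bar(positions, counts)
    ax.set_xticks(positions)
    ax.set_xticklabels(words, rotation=45, ha='right')
    ax.set_title(title)
    ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()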