-
Notifications
You must be signed in to change notification settings - Fork 0
/
getMentions.py
69 lines (50 loc) · 1.61 KB
/
getMentions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import time
from bs4 import BeautifulSoup
import re
import nltk
import requests
import pandas as pd
from tickList import tickList
# Scan every article linked in mentions.csv for known ticker symbols and store
# the symbols found in that row's "Tickers Mentioned" column.

CSV_PATH = "mentions.csv"
LINK_COLUMN = "Link"
TICKER_COLUMN = "Tickers Mentioned"
START_ROW = 300       # first row index processed; earlier rows are left untouched
REQUEST_TIMEOUT = 10  # seconds allowed for each HTTP request
MAX_RETRIES = 5       # extra attempts made after a non-200 response
PAUSE_SECONDS = 1     # delay before each request, to avoid hammering servers

# Build the lookup once so each token test is a hash lookup instead of a scan.
# NOTE(review): assumes tickList is an iterable of hashable ticker strings -
# confirm against tickList.py.
KNOWN_TICKERS = frozenset(tickList)


def _fetch(link):
    """Download *link*, retrying while the response status is not 200.

    At most MAX_RETRIES extra requests are made, pausing PAUSE_SECONDS before
    each one. Returns the last response received, which may still be non-200.
    Any exception raised by ``requests`` propagates to the caller.
    """
    response = requests.get(link, timeout=REQUEST_TIMEOUT)
    print("Status: ", response.status_code)
    tryCount = 0
    while response.status_code != 200 and tryCount < MAX_RETRIES:
        print("Retrying...", tryCount)
        time.sleep(PAUSE_SECONDS)
        response = requests.get(link, timeout=REQUEST_TIMEOUT)
        tryCount += 1
    return response


def _tickers_in(html):
    """Return the sorted, de-duplicated known tickers found in *html*'s text."""
    soup = BeautifulSoup(html, "html.parser")
    # Join every text node, then collapse whitespace runs to single spaces.
    pageSource = re.sub(r"\s+", " ", " ".join(soup.strings))
    tokens = nltk.word_tokenize(pageSource)
    # Sorting keeps the saved value stable from run to run.
    return sorted(KNOWN_TICKERS.intersection(tokens))


# Read in news dataframe
newsdf = pd.read_csv(CSV_PATH)

# Each cell stores a Python list, so the column has to exist and be of object
# dtype before individual cells are written.
if TICKER_COLUMN not in newsdf.columns:
    newsdf[TICKER_COLUMN] = None
newsdf[TICKER_COLUMN] = newsdf[TICKER_COLUMN].astype(object)

# NOTE(review): the upper bound len(newsdf) - 1 means the final row is never
# processed. Kept as in the original; confirm whether that is intentional.
for i in range(START_ROW, len(newsdf) - 1):
    # Get link
    link = newsdf.at[i, LINK_COLUMN]
    print("Link #{}: {}".format(i, link))
    time.sleep(PAUSE_SECONDS)

    try:
        r = _fetch(link)
    except Exception as err:
        # Best effort: one bad link must not stop the crawl. Catching
        # Exception (not a bare except) still lets Ctrl-C interrupt the run.
        print("Error", err)
        continue

    if r.status_code != 200:
        # Still failing after the retries: the body is an error page, so
        # scanning it would record tickers the article never mentioned.
        print("Skipping, status: ", r.status_code)
        continue

    tickMentioned = _tickers_in(r.content)
    print(tickMentioned)

    # Write the single cell with .at; chained indexing
    # (newsdf[col][i] = ...) may assign to a temporary copy and be lost.
    newsdf.at[i, TICKER_COLUMN] = tickMentioned

    # Save after every processed row so an interrupted run keeps its progress.
    newsdf.to_csv(CSV_PATH, index=False)