reddit-scraper.py
#!/usr/bin/env python3
"""Scrape external post links from a fixed set of subreddits.

For each (category, subreddit, limit) entry, walk the subreddit's listing
pages and print one line per outbound link until the limit is reached.
"""
import http.client
import time

from bs4 import BeautifulSoup
# from separator import filter_output, separate  # imported in the original but unused here

URL = "https://www.reddit.com/r/"
HEADER = {'User-Agent': 'hubble-tags'}
# reddit redirects plain HTTP, and http.client does not follow redirects,
# so connect over HTTPS directly.
CONNECTION = http.client.HTTPSConnection('www.reddit.com')
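# NOTE (assumption): the CSS classes and pagination markup targeted below
# match the old reddit HTML. If www.reddit.com serves the redesigned UI,
# pointing CONNECTION at 'old.reddit.com' instead should restore that markup.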
REDDIT_CATEGORIES = [
    ('programming', 'machinelearning', 150),
    ('programming', 'compsci', 250),
    ('programming', 'programming', 150),
    ('business', 'business', 200),
    ('design', 'web_design', 300),
    ('design', 'graphic_design', 300),
    ('entertainment', 'Music', 150),
    ('entertainment', 'movies', 150),
    ('entertainment', 'books', 150),
    ('entertainment', 'television', 150),
    ('science', 'science', 100),
    ('science', 'Physics', 100),
    ('science', 'chemistry', 100),
    ('science', 'biology', 100),
    ('science', 'math', 100),
    ('security', 'networking', 200),
    ('security', 'hacking', 200),
    ('security', 'ComputerSecurity', 200),
    ('worldnews', 'worldnews', 200),
    ('worldnews', 'news', 200),
    ('worldnews', 'truenews', 200),
]
def get_soup(subreddit, partial_url):
    """Fetch one listing page of a subreddit and return it parsed."""
    CONNECTION.request('GET', '/r/' + subreddit + '/' + partial_url, headers=HEADER)
    html = CONNECTION.getresponse().read()
    time.sleep(2)  # throttle so we stay polite to reddit's servers
    return BeautifulSoup(html, 'html.parser')  # name a parser explicitly
def get_next_link(soup):
    """Return the href of the 'next page' anchor, or "" when on the last page."""
    next_a_tag = soup.find('a', rel='nofollow next')
    return next_a_tag['href'] if next_a_tag is not None else ""
def print_all_links(reddit, subreddit, soup, size):
    """Print each external post link on the page; return the updated link count."""
    post_a_tags = soup.find_all('a', class_='title may-blank ')
    for post_a_tag in post_a_tags:
        if post_a_tag['href'].startswith('http'):  # skip self-posts and relative links
            print(reddit, subreddit, post_a_tag['href'], post_a_tag.get_text())
            size += 1
    return size
def main():
    for reddit, subreddit, limit in REDDIT_CATEGORIES:
        url = URL + subreddit
        print(url)
        size = 0
        partial_url = ""
        while size <= limit:
            soup = get_soup(subreddit, partial_url)
            # Count this page's links before checking for a next page, so the
            # final page of a listing is not silently skipped.
            size = print_all_links(reddit, subreddit, soup, size)
            next_link = get_next_link(soup)
            if next_link == "":
                break
            # Keep only the query string ('?count=...&after=...') of the next link.
            partial_url = next_link.split('/')[-1]
    CONNECTION.close()


if __name__ == "__main__":
    main()
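# Illustrative run (assumes bs4 is installed and old-reddit markup is served).
# Each printed line is: <category> <subreddit> <url> <title>, for example
# (URL and title below are hypothetical):
#   programming machinelearning http://example.com/paper Some post title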