scrap_quotes.py
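"""Scrape quotes from http://quotes.toscrape.com page by page and store
each quote's text, author, author bio link, and tags in a local MySQL
database (ScrapQuotes)."""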
from bs4 import BeautifulSoup
from urllib.request import urlopen  # Python 3 location; was `from urllib import urlopen` (Python 2)
import re
import time

import pymysql

# `password` was an undefined name in the original; substitute your own credentials.
conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock',
                       user='root', password='YOUR_PASSWORD', db='mysql',
                       charset='utf8')
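# The INSERT/SELECT statements below assume a ScrapQuotes database with
# roughly this schema (a sketch inferred from the queries; the exact column
# types are assumptions, not taken from the original repo):
#
#   CREATE DATABASE ScrapQuotes;
#   CREATE TABLE authors (
#       auth_id INT AUTO_INCREMENT PRIMARY KEY,
#       auth_name VARCHAR(255) NOT NULL,
#       auth_about VARCHAR(255)
#   );
#   CREATE TABLE quotes (
#       quote_id INT AUTO_INCREMENT PRIMARY KEY,
#       quote_text TEXT NOT NULL,
#       auth_id INT,
#       quote_tags VARCHAR(255),
#       FOREIGN KEY (auth_id) REFERENCES authors(auth_id)
#   );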
def writeToDb(quotes):
    """Insert scraped quotes into the database, creating author rows as needed."""
    cur = conn.cursor()
    cur.execute('USE ScrapQuotes')
    try:
        for quote, author, author_about, tags in quotes:
            # Insert the author only if we have not seen them before.
            cur.execute("""
                SELECT auth_name
                FROM authors
                WHERE auth_name = %s
            """, (author,))
            if not cur.fetchone():
                cur.execute("""
                    INSERT INTO authors (auth_name, auth_about)
                    VALUES (%s, %s)
                """, (author, author_about))
            # Link the quote to its author via a subquery on auth_id.
            cur.execute("""
                INSERT INTO quotes (quote_text, auth_id, quote_tags)
                VALUES (
                    %s,
                    (SELECT auth_id
                     FROM authors
                     WHERE auth_name = %s),
                    %s
                )
            """, (quote, author, ' '.join(tags)))
        conn.commit()
    except pymysql.MySQLError:
        # The original bare `except` closed the connection here and then fell
        # through to commit on it; roll back and re-raise instead.
        conn.rollback()
        raise
    finally:
        cur.close()
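# getQuotes() below expects each quote on the page to be marked up roughly
# like this (a sketch of the structure observed on quotes.toscrape.com, not
# a guaranteed-stable contract):
#
#   <div class="quote">
#       <span class="text">"Quote text..."</span>
#       <small class="author">Author Name</small>
#       <a href="/author/Author-Name">(about)</a>
#       <a class="tag" href="...">tag1</a> <a class="tag" href="...">tag2</a>
#   </div>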
def getQuotes(page):
    """Extract (quote, author, author_about_url, tags) tuples from one page."""
    quotes = []
    for quoteBlock in page.find_all('div', {'class': 'quote'}):
        quote = quoteBlock.find('span', {'class': 'text'}).get_text()
        author = quoteBlock.find('small', {'class': 'author'}).get_text()
        # Turn the relative "/author/..." link into an absolute URL.
        author_about = 'http://quotes.toscrape.com' + \
            quoteBlock.find('a', {'href': re.compile('^/author/.*')})['href']
        tags = [tag.get_text() for tag in quoteBlock.find_all('a', {'class': 'tag'})]
        quotes.append((quote, author, author_about, tags))
    return quotes
def getNextPage(page):
    """Return the absolute URL of the next page, or None on the last page."""
    next_button = page.find('li', {'class': 'next'})
    if next_button is None:
        return None
    next_link = next_button.find('a', href=re.compile('^/page/[0-9]+/$'))
    if next_link is None:
        return None
    return 'http://quotes.toscrape.com' + next_link['href']
def parse(url):
    """Scrape one page, store its quotes, then recurse into the next page."""
    while True:
        try:
            html = urlopen(url)
            break
        except IOError:
            print("Can't reach " + url)
            print("Backing off for a minute")
            time.sleep(60)
    bsObj = BeautifulSoup(html, 'html.parser')
    quotes = getQuotes(bsObj)
    writeToDb(quotes)
    next_url = getNextPage(bsObj)
    if next_url:
        time.sleep(3)  # be polite: pause between page requests
        parse(next_url)
    else:
        print("Reached the final page")
if __name__ == '__main__':
    start_url = 'http://quotes.toscrape.com'
    try:
        parse(start_url)
    finally:
        conn.close()