-
Notifications
You must be signed in to change notification settings - Fork 0
/
douban.py
90 lines (69 loc) · 2 KB
/
douban.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
import requests
from pyquery import PyQuery as pq
class Model:
    """
    Base class that gives its subclasses a readable, multi-line ``repr``.
    """

    def __repr__(self):
        """
        Return the class name followed by one ``attr=(value)`` entry per
        line, one for every attribute stored on the instance.
        """
        entries = []
        for key, value in vars(self).items():
            entries.append('{}=({})'.format(key, value))
        body = '\n '.join(entries)
        return '\n<{} \n {}>'.format(type(self).__name__, body)
class Movie(Model):
    """
    Holds the information collected for a single movie.
    """

    def __init__(self):
        # Attributes are created in a fixed order (name, other, score, quote,
        # cover_url, ranking) because the inherited __repr__ lists them in
        # creation order.
        self.name = self.other = ''
        self.score = 0
        self.quote = self.cover_url = ''
        self.ranking = 0
def get(url, filename, timeout=30):
    """
    Return the body found at ``url`` as bytes, using an on-disk cache.

    The body is stored as ``cached/<filename>`` (relative to the current
    working directory). When that file already exists its content is
    returned and no request is made.

    Args:
        url: address to download on a cache miss.
        filename: name of the cache file inside the ``cached`` folder.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The raw body as ``bytes``.

    Raises:
        requests.RequestException: on a network failure or timeout, or when
            the server answers with an HTTP error status.
    """
    folder = 'cached'
    # exist_ok avoids failing if the folder appears between check and create.
    os.makedirs(folder, exist_ok=True)
    path = os.path.join(folder, filename)

    if os.path.exists(path):
        with open(path, 'rb') as f:
            return f.read()

    # Without a timeout a stalled connection would block forever.
    r = requests.get(url, timeout=timeout)
    # Never cache an error page: once written it would be served on every
    # later call instead of the real content.
    r.raise_for_status()
    content = r.content
    with open(path, 'wb') as f:
        f.write(content)
    return content
def movie_from_div(div):
    """
    Build a Movie from a single div element.

    Each field is read from the element with a CSS selector; the cover URL
    comes from the ``src`` attribute of the ``img`` tag and the ranking from
    the ``em`` tag inside ``.pic``. Values are stored exactly as pyquery
    returns them.
    """
    node = pq(div)
    movie = Movie()

    # Plain-text fields: attribute name -> selector whose text fills it.
    text_selectors = (
        ('name', '.title'),
        ('other', '.other'),
        ('score', '.rating_num'),
        ('quote', '.inq'),
    )
    for attr, selector in text_selectors:
        setattr(movie, attr, node(selector).text())

    movie.cover_url = node('img').attr('src')
    movie.ranking = node('.pic').find('em').text()
    return movie
def save_cover(movies):
    """
    Fetch the cover image of every movie through the cache helper.

    Each image is stored under the name ``<ranking>.jpg``.
    """
    for movie in movies:
        cover_name = '{}.jpg'.format(movie.ranking)
        get(movie.cover_url, cover_name)
def cached_page(url):
    """
    Return the content of ``url``, fetched through the cache helper.

    The cache file is named after the text that follows the first ``=`` in
    the URL (the whole URL when it contains no ``=``), plus ``.html``.
    """
    key = url.split('=', 1)[-1]
    return get(url, '{}.html'.format(key))
def movies_from_url(url):
    """
    Parse every ``.item`` element of the page at ``url`` into a Movie,
    save the covers of those movies, and return the list of movies.
    """
    document = pq(cached_page(url))
    movies = []
    for item in document('.item'):
        movies.append(movie_from_div(item))
    save_cover(movies)
    return movies
def main():
    """
    Walk the top-250 listing in steps of 25 (start=0, 25, ..., 225) and
    print the movies parsed from each page.
    """
    page_url = 'https://movie.douban.com/top250?start={}'
    for start in range(0, 250, 25):
        movies = movies_from_url(page_url.format(start))
        print('top250 movies', movies)


# Run the scraper only when the file is executed as a script.
if __name__ == '__main__':
    main()