-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.py
79 lines (65 loc) · 2 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from http.client import HTTPException
from json import dumps, loads
from socket import timeout
from urllib.parse import urlencode
from urllib.request import urlopen
# API key sent as the `apikey` query parameter of every search request.
# NOTE(review): the credential is hard-coded in source; consider loading it
# from the environment instead.
API_KEY = '47a04b83'
# Word the crawl starts from.
START_WORD = 'Batman'
# Upper bound on the number of result pages fetched for a single word.
MAX_PAGE = 5
# Words that have already been searched (word -> True), so that no word is
# searched twice.
search_dict = {}
# Movie IDs that have already been written to the output (id -> True), so
# that no ID is written twice.
result_dict = {}
# Number of search requests attempted so far, failed ones included.
total_searches = 0
class SearchResultItem:
    """One movie entry taken from a search response: its title and its ID."""

    def __init__(self, title, id):
        # `id` shadows the builtin, but it is part of the constructor's
        # keyword interface, so the parameter name is kept as is.
        self.title, self.id = title, id
class SearchResult:
    """Outcome of a single search request.

    Attributes:
        total: Number of matches reported for the query as a whole.
        items: The entries carried by this particular response.
    """

    def __init__(self, total, items):
        self.items = items
        self.total = total
def search(word, page):
    """Query the OMDb search endpoint for movies matching ``word``.

    Every attempted request, successful or not, counts against a budget kept
    in the module-level ``total_searches``; once 100 requests have been
    attempted, no further request is sent.

    Args:
        word: Search term. It is URL-encoded before being sent, so spaces,
            ``&``, ``#`` and non-ASCII characters are transmitted intact.
        page: Number of the result page to request.

    Returns:
        A ``SearchResult`` holding the reported total and the items of this
        page, or ``None`` when the budget is exhausted, the request or JSON
        decoding fails, the API reports an unsuccessful response, or the
        payload lacks the expected fields.
    """
    global total_searches
    if total_searches > 99:
        return None

    # Counted before the attempt so that failed requests consume budget too.
    total_searches += 1

    try:
        query = urlencode({
            's': word,
            'page': page,
            'apikey': API_KEY,
            'type': 'movie',
        })
        # The context manager closes the connection even if reading fails.
        with urlopen('http://www.omdbapi.com/?' + query, timeout=1) as response:
            contents = loads(response.read())
    except (OSError, ValueError, HTTPException):
        # Network errors and timeouts are OSError subclasses; JSON and
        # encoding errors are ValueError subclasses. KeyboardInterrupt and
        # SystemExit are deliberately left to propagate.
        return None

    if not isinstance(contents, dict) or contents.get('Response') != 'True':
        return None

    try:
        items = [
            SearchResultItem(entry['Title'], entry['imdbID'])
            for entry in contents['Search']
        ]
        return SearchResult(int(contents['totalResults']), items)
    except (KeyError, TypeError, ValueError):
        # A payload without the expected fields is treated as a failed search.
        return None
def deplete_word(word):
    """Collect the search results for ``word`` across its result pages.

    Page 1 is always requested. The total it reports determines how many
    pages exist, and pages 2 up to ``MAX_PAGE`` (inclusive) are then
    requested as far as they exist. Pages whose request fails are skipped.

    Args:
        word: The search term.

    Returns:
        A list with the items of every page that was fetched successfully,
        in page order; empty when the first request yields nothing.
    """
    # Assumes the API serves 10 results per page, as the original page
    # arithmetic did.
    results_per_page = 10

    first = search(word, 1)
    if first is None:
        return []

    movies = list(first.items)
    # Ceiling division: a final, partially filled page still counts as a page.
    page_count = -(-first.total // results_per_page)
    for page in range(2, min(page_count, MAX_PAGE) + 1):
        result = search(word, page)
        if result is not None:
            movies.extend(result.items)
    return movies
def process_word_recursively(word, f, count = 1):
    """Search ``word``, record its new movie IDs, then follow the titles.

    Each movie ID not seen before is written to ``f`` as a JSON string.
    Entries are separated by ``',\\n'``, written *before* every entry except
    the very first, so the output never ends in a dangling comma and forms a
    valid JSON array once wrapped in ``[`` and ``]``. Afterwards the second
    word of each found title is processed the same way, one level deeper.

    Args:
        word: The word to search for.
        f: Writable text file object that receives the IDs.
        count: Current recursion depth, starting at 1.

    Returns:
        ``True`` only when ``count`` exceeds 10, which tells the caller to
        stop following further titles; ``False`` otherwise, including when
        ``word`` had already been searched.
    """
    if count > 10:
        return True
    if word in search_dict:
        return False
    search_dict[word] = True

    movies = deplete_word(word)

    for movie in movies:
        movie_id = movie.id
        if movie_id in result_dict:
            continue
        # result_dict is non-empty exactly when an ID has been written before,
        # so it doubles as the "not the first entry" flag for the separator.
        if result_dict:
            f.write(',\n')
        f.write(dumps(movie_id))
        result_dict[movie_id] = True

    for movie in movies:
        title_words = movie.title.split()
        print(title_words)
        # Recurse on the second word of the title; a True result means the
        # depth limit was hit, so the remaining titles are not followed.
        if len(title_words) > 1 and process_word_recursively(title_words[1], f, count + 1):
            break
    return False
# Script body: crawl outward from START_WORD and write every newly seen movie
# ID between an opening '[' and a closing ']' in the file 'movies'.
with open('movies', mode='w') as f:
    f.write('[')
    process_word_recursively(START_WORD, f)
    f.write(']')