crawl.py
import json
from abc import ABC, abstractmethod

import requests
from bs4 import BeautifulSoup

from storage import MongoStorage, FileStorage
from config import BASE_LINK, STORAGE_TYPE
from parser import AdvertisementPageParser


class CrawlerBase(ABC):
    """Shared behaviour for all crawlers: storage selection and HTTP fetching."""

    def __init__(self):
        self.storage = self.__set_storage()

    @staticmethod
    def __set_storage():
        # Choose the storage backend based on the configured STORAGE_TYPE.
        if STORAGE_TYPE == 'mongo':
            return MongoStorage()
        return FileStorage()

    @abstractmethod
    def start(self, store=False):
        pass

    @abstractmethod
    def store(self, data, filename=None):
        # Subclasses override this with whatever extra arguments they need.
        pass

    @staticmethod
    def get(link):
        # requests.get raises RequestException (not HTTPError) on network failures.
        try:
            response = requests.get(link)
        except requests.RequestException:
            return None
        return response


class LinkCrawler(CrawlerBase):
    """Collects advertisement links from paginated city listing pages."""

    def __init__(self, cities, link=BASE_LINK):
        self.cities = cities
        self.link = link
        super().__init__()

    def find_links(self, html_doc):
        soup = BeautifulSoup(html_doc, 'html.parser')
        # attrs must be a dict ({"class": "hdrlnk"}), not a set.
        return soup.find_all('a', attrs={'class': 'hdrlnk'})

    def start_crawl_city(self, url):
        start = 0
        crawl = True
        ad_list = list()
        while crawl:
            offers = self.get(url + str(start))
            new_list = self.find_links(offers.text)
            ad_list.extend(new_list)
            start += 120  # listings are paginated in steps of 120 results
            crawl = bool(len(new_list))
        return ad_list

    def start(self, store=False):
        adv_links = list()
        for city in self.cities:
            links = self.start_crawl_city(self.link.format(city))
            print('total', city, len(links))
            adv_links.extend(links)
        if store:
            self.store([{'url': li.get('href'), 'flag': False} for li in adv_links])
        return adv_links

    def store(self, data, *args):
        self.storage.store(data, 'advertisement_links')


class DataCrawler(CrawlerBase):
    def __init__(self):
        super().__init__()
        self.links = self.__load_links()
        self.parser = AdvertisementPageParser()

    def __load_links(self):
        return self.storage.load('advertisement_links', {'flag': False})

    def start(self, store):
        for link in self.links:
            response = self.get(link['url'])
            data = self.parser.parse(response.text)
            if store:
                self.store(data, data.get('post_id', 'sample'))
            self.storage.update_flag(data)

    def store(self, data, filename):
        self.storage.store(data, 'advertisement_data')
        print(data['post_id'])


class ImageDownloader(CrawlerBase):
    """Downloads the images referenced by each stored advertisement."""

    def __init__(self, *args, **kwargs):
        super().__init__()
        self.advertisements = self.__load_advertisements()

    def __load_advertisements(self):
        return self.storage.load('advertisement_data')

    @staticmethod
    def get(link):
        # Stream the response so large images are not read into memory at once.
        try:
            response = requests.get(link, stream=True)
        except requests.RequestException:
            return None
        return response

    def start(self, store=True):
        for advertisement in self.advertisements:
            counter = 1
            for image in advertisement['images']:
                response = self.get(image['url'])
                if store:
                    self.store(response, advertisement['post_id'], counter)
                counter += 1

    def store(self, data, adv_id, img_number):
        filename = f'{adv_id} - {img_number}'
        return self.save_to_disc(data, filename)

    def save_to_disc(self, response, filename):
        # Write the streamed response in chunks; the original wrote
        # response.content repeatedly inside the iter_content loop.
        with open(f'fixtures/images/{filename}.jpg', 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(filename)
        return filename
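

# --- Minimal usage sketch (not part of the original file) --------------------
# Assumptions: BASE_LINK contains a '{}' placeholder for the city slug, and the
# storage/parser modules imported above are configured. City names below are
# illustrative only.
if __name__ == '__main__':
    link_crawler = LinkCrawler(cities=['paris', 'berlin'])
    link_crawler.start(store=True)        # collect advertisement links

    data_crawler = DataCrawler()
    data_crawler.start(store=True)        # fetch and parse each stored link

    image_downloader = ImageDownloader()
    image_downloader.start(store=True)    # download advertisement images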