Merge pull request NodyHub#16 from codders/feat/crawl-and-filter-dates
Improve efficiency of IDMaintainer, extract 'from' dates
codders authored Jun 11, 2020
2 parents 72351a1 + 2b34eca commit b2972b8
Showing 17 changed files with 231 additions and 103 deletions.
6 changes: 5 additions & 1 deletion flathunter/abstract_crawler.py
@@ -16,4 +16,8 @@ def crawl(self, url, max_pages=None):
return []

def get_name(self):
return type(self).__name__
return type(self).__name__

def get_expose_details(self, expose):
# Implement in subclass - extract additional data by processing the expose URL
return expose
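
The new get_expose_details hook is a no-op in the base class; each crawler below overrides it to fetch an expose's detail page and fill in a 'from' date. A minimal sketch of how a concrete crawler might override the hook (ExampleCrawler and the 'move-in-date' selector are invented for illustration, not part of this commit):

import datetime
import requests
from bs4 import BeautifulSoup

from flathunter.abstract_crawler import Crawler

class ExampleCrawler(Crawler):
    """Hypothetical crawler showing how the get_expose_details hook is meant to be used."""

    def get_expose_details(self, expose):
        # Fetch the detail page for this expose and look for a move-in date
        soup = BeautifulSoup(requests.get(expose['url']).content, 'html.parser')
        date_tag = soup.find('span', {'class': 'move-in-date'})  # selector is made up
        if date_tag is not None:
            expose['from'] = date_tag.text.strip()
        else:
            # Fall back to today's date, as the real crawlers below do
            expose['from'] = datetime.datetime.now().strftime('%d.%m.%Y')
        return expose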
34 changes: 30 additions & 4 deletions flathunter/crawl_ebaykleinanzeigen.py
@@ -1,13 +1,28 @@
import logging
import requests
import re
import datetime
from bs4 import BeautifulSoup
from flathunter.abstract_crawler import Crawler

class CrawlEbayKleinanzeigen(Crawler):
__log__ = logging.getLogger(__name__)
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'
URL_PATTERN = re.compile(r'https://www\.ebay-kleinanzeigen\.de')
MONTHS = {
"Januar": "01",
"Februar": "02",
"März": "03",
"April": "04",
"Mai": "05",
"Juni": "06",
"Juli": "07",
"August": "08",
"September": "09",
"Oktober": "10",
"November": "11",
"Dezember": "12"
}

def __init__(self):
logging.getLogger("requests").setLevel(logging.WARNING)
@@ -29,6 +44,17 @@ def get_page(self, search_url):
self.__log__.error("Got response (%i): %s" % (resp.status_code, resp.content))
return BeautifulSoup(resp.content, 'html.parser')

def get_expose_details(self, expose):
soup = self.get_page(expose['url'])
for detail in soup.find_all('li', { "class": "addetailslist--detail" }):
if re.match(r'Verfügbar ab', detail.text):
date_string = re.match(r'(\w+) (\d{4})', detail.text)
if date_string is not None:
expose['from'] = "01." + self.MONTHS[date_string[1]] + "." + date_string[2]
if 'from' not in expose:
expose['from'] = datetime.datetime.now().strftime('%d.%m.%Y')
return expose

def extract_data(self, soup):
entries = list()
soup = soup.find(id="srchrslt-adtable")
@@ -57,14 +83,14 @@ def extract_data(self, soup):
address = address.replace('\n', ' ').replace('\r', '')
address = " ".join(address.split())
try:
self.__log__.debug(tags[0].text)
rooms = tags[0].text
self.__log__.debug(tags[1].text)
rooms = re.match(r'(\d+)', tags[1].text)[1]
except IndexError:
self.__log__.debug("Keine Zimmeranzahl gegeben")
rooms = "Nicht gegeben"
try:
self.__log__.debug(tags[1].text)
size = tags[1].text
self.__log__.debug(tags[0].text)
size = tags[0].text
except IndexError:
size = "Nicht gegeben"
self.__log__.debug("Quadratmeter nicht angegeben")
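
The Ebay Kleinanzeigen detail page lists availability as a German month name plus a year, which the crawler maps through MONTHS and pins to the first of the month. A standalone sketch of that conversion, assuming detail text of the form "Verfügbar ab August 2020" (the mapping is abbreviated here):

import re

MONTHS = {"Januar": "01", "August": "08", "Dezember": "12"}  # abbreviated copy of CrawlEbayKleinanzeigen.MONTHS

def first_of_month(detail_text):
    # "Verfügbar ab August 2020" -> "01.08.2020"; None if no month/year is found
    match = re.search(r'(\w+) (\d{4})', detail_text)
    if match is None or match[1] not in MONTHS:
        return None
    return "01." + MONTHS[match[1]] + "." + match[2]

assert first_of_month("Verfügbar ab August 2020") == "01.08.2020"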
20 changes: 17 additions & 3 deletions flathunter/crawl_immobilienscout.py
@@ -1,7 +1,9 @@
import logging
import requests
import re
import datetime
from bs4 import BeautifulSoup

from flathunter.abstract_crawler import Crawler

class CrawlImmobilienscout(Crawler):
@@ -47,12 +49,24 @@ def get_results(self, search_url, max_pages=None):
entries.extend(cur_entry)
return entries

def get_page(self, search_url, page_no):
resp = requests.get(search_url.format(page_no))
def get_soup_from_url(self, url):
resp = requests.get(url)
if resp.status_code != 200:
self.__log__.error("Got response (%i): %s" % (resp.status_code, resp.content))
return BeautifulSoup(resp.content, 'html.parser')

def get_page(self, search_url, page_no):
return self.get_soup_from_url(search_url.format(page_no))

def get_expose_details(self, expose):
soup = self.get_soup_from_url(expose['url'])
date = soup.find('dd', { "class": "is24qa-bezugsfrei-ab" })
expose['from'] = datetime.datetime.now().strftime("%d.%m.%Y")
if date is not None:
if not re.match(r'.*sofort.*', date.text):
expose['from'] = date.text.strip()
return expose

def extract_data(self, soup):
entries = list()

@@ -98,7 +112,7 @@ def extract_data(self, soup):
'title': title_el.text.strip().replace('NEU', ''),
'price': attr_els[0].text.strip().split(' ')[0].strip(),
'size': attr_els[1].text.strip().split(' ')[0].strip() + " qm",
'rooms': attr_els[2].text.strip().split(' ')[0].strip() + " Zi.",
'rooms': attr_els[2].text.strip().split(' ')[0].strip(),
'address': address,
'crawler': self.get_name()
}
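
For Immobilienscout the availability sits in a dd tag with class "is24qa-bezugsfrei-ab"; "sofort" (available immediately) falls back to today's date, anything else is taken verbatim. A minimal sketch of that decision fed with inline HTML snippets (the snippets are made up; only the class name comes from the crawler):

import datetime
import re
from bs4 import BeautifulSoup

def extract_from_date(html):
    soup = BeautifulSoup(html, 'html.parser')
    date = soup.find('dd', {'class': 'is24qa-bezugsfrei-ab'})
    # Default to today's date, mirroring the crawler's fallback
    result = datetime.datetime.now().strftime('%d.%m.%Y')
    if date is not None and not re.match(r'.*sofort.*', date.text):
        result = date.text.strip()
    return result

print(extract_from_date('<dd class="is24qa-bezugsfrei-ab"> 01.08.2020 </dd>'))  # "01.08.2020"
print(extract_from_date('<dd class="is24qa-bezugsfrei-ab">sofort</dd>'))        # today's date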
24 changes: 24 additions & 0 deletions flathunter/crawl_immowelt.py
@@ -1,7 +1,9 @@
import logging
import requests
import re
import datetime
from bs4 import BeautifulSoup

from flathunter.abstract_crawler import Crawler

class CrawlImmowelt(Crawler):
@@ -28,6 +30,28 @@ def get_page(self, search_url):
self.__log__.error("Got response (%i): %s" % (resp.status_code, resp.content))
return BeautifulSoup(resp.content, 'html.parser')

def get_expose_details(self, expose):
soup = self.get_page(expose['url'])
immo_div = soup.find("div", { "id": "divImmobilie" })
if immo_div is not None:
details = immo_div.find_all("div", { "class": "clear" })
for detail in details:
if detail.find("div", { "class": "iw_left" }) is None:
continue
if detail.find("div", { "class": "iw_left" }).text.strip() == 'Die Wohnung':
description_element = detail.find("div", { "class": "iw_right" })
if description_element is None or description_element.find("p") is None:
continue
description = description_element.find("p").text
if re.match(r'.*sofort.*', description, re.MULTILINE|re.DOTALL|re.IGNORECASE):
expose['from'] = datetime.datetime.now().strftime("%d.%m.%Y")
date_string = re.match(r'.*(\d{2}.\d{2}.\d{4}).*', description, re.MULTILINE|re.DOTALL)
if date_string is not None:
expose['from'] = date_string[1]
if 'from' not in expose:
expose['from'] = datetime.datetime.now().strftime("%d.%m.%Y")
return expose

def extract_data(self, soup):
entries = list()
soup = soup.find(id="listItemWrapperFixed")
12 changes: 9 additions & 3 deletions flathunter/crawl_wggesucht.py
@@ -59,20 +59,26 @@ def extract_data(self, soup):
numbers_row = row.find("div", { "class": "middle" })
price = numbers_row.find("div", { "class": "col-xs-3" }).text.strip()
rooms = re.findall(r'\d Zimmer', details_array[0])[0][:1]
date = re.findall(r'\d{2}.\d{2}.\d{4}', numbers_row.find("div", { "class": "text-center" }).text)[0]
dates = re.findall(r'\d{2}.\d{2}.\d{4}', numbers_row.find("div", { "class": "text-center" }).text)
size = re.findall(r'\d{2,4}\sm²', numbers_row.find("div", { "class": "text-right" }).text)[0]

details = {
'id': int(url.split('.')[-2]),
'image': image,
'url': url,
'title': "%s ab dem %s" % (title, date),
'title': "%s ab dem %s" % (title, dates[0]),
'price': price,
'size': size,
'rooms': rooms + " Zi.",
'rooms': rooms,
'address': url,
'crawler': self.get_name()
}
if len(dates) == 2:
details['from'] = dates[0]
details['to'] = dates[1]
elif len(dates) == 1:
details['from'] = dates[0]

entries.append(details)

self.__log__.debug('extracted: ' + str(entries))
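
WG-Gesucht rows carry either a single move-in date or a from/to pair for limited sublets, which is why the crawler now keeps the whole findall result instead of only the first hit. A small sketch of that branching on made-up date strings (this sketch escapes the dots in the pattern; the crawler's pattern leaves them unescaped):

import re

def extract_dates(text):
    # One date means open-ended, two dates mean a limited rental period
    dates = re.findall(r'\d{2}\.\d{2}\.\d{4}', text)
    details = {}
    if len(dates) >= 1:
        details['from'] = dates[0]
    if len(dates) == 2:
        details['to'] = dates[1]
    return details

print(extract_dates("01.07.2020"))               # {'from': '01.07.2020'}
print(extract_dates("01.07.2020 - 30.09.2020"))  # {'from': '01.07.2020', 'to': '30.09.2020'}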
11 changes: 11 additions & 0 deletions flathunter/default_processors.py
@@ -27,6 +27,17 @@ def process_expose(self, expose):
break
return expose

class CrawlExposeDetails(Processor):

def __init__(self, config):
self.config = config

def process_expose(self, expose):
for searcher in self.config.searchers():
if re.search(searcher.URL_PATTERN, expose['url']):
expose = searcher.get_expose_details(expose)
return expose

class LambdaProcessor(Processor):

def __init__(self, config, func):
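
CrawlExposeDetails walks the configured searchers and lets whichever crawler's URL_PATTERN matches the expose URL enrich that expose, so detail crawling stays independent of which site produced the result. A sketch of that dispatch with stub config and crawler classes (both invented for illustration; only the loop mirrors the processor above):

import re

class StubCrawler:
    # Stand-in for a real crawler; only the pieces CrawlExposeDetails relies on
    URL_PATTERN = re.compile(r'https://www\.ebay-kleinanzeigen\.de')

    def get_expose_details(self, expose):
        expose['from'] = '01.08.2020'
        return expose

class StubConfig:
    def searchers(self):
        return [StubCrawler()]

def process_expose(config, expose):
    # Same loop as CrawlExposeDetails.process_expose
    for searcher in config.searchers():
        if re.search(searcher.URL_PATTERN, expose['url']):
            expose = searcher.get_expose_details(expose)
    return expose

expose = {'url': 'https://www.ebay-kleinanzeigen.de/s-anzeige/123'}
print(process_expose(StubConfig(), expose))  # gains 'from': '01.08.2020'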
18 changes: 7 additions & 11 deletions flathunter/googlecloud_idmaintainer.py
@@ -23,22 +23,27 @@ def mark_processed(self, expose_id):
self.__log__.debug('mark_processed(' + str(expose_id) + ')')
self.db.collection(u'processed').document(str(expose_id)).set({ u'id': expose_id })

def is_processed(self, expose_id):
self.__log__.debug('is_processed(' + str(expose_id) + ')')
doc = self.db.collection(u'processed').document(str(expose_id))
return doc.get().exists

def save_expose(self, expose):
record = expose.copy()
record.update({ 'created_at': datetime.datetime.now(), 'created_sort': (0 - datetime.datetime.now().timestamp()) })
self.db.collection(u'exposes').document(str(expose[u'id'])).set(record)

def get_exposes_since(self, min_datetime):
res = []
for doc in self.db.collection(u'exposes').order_by('created_sort').stream():
for doc in self.db.collection(u'exposes').order_by('created_sort').limit(100).stream():
if doc.to_dict()[u'created_at'] < min_datetime:
break
res.append(doc.to_dict())
return res

def get_recent_exposes(self, count, filter=None):
res = []
for doc in self.db.collection(u'exposes').order_by('created_sort').stream():
for doc in self.db.collection(u'exposes').order_by('created_sort').limit(100).stream():
expose = doc.to_dict()
if filter is None or filter.is_interesting_expose(expose):
res.append(expose)
@@ -66,15 +71,6 @@ def get_user_filters(self):
res.append((int(doc.id), settings['filters']))
return res

def get(self):
res = []
for doc in self.db.collection(u'processed').stream():
res.append(doc.to_dict()[u'id'])

self.__log__.info('already processed: ' + str(len(res)))
self.__log__.debug(str(res))
return res

def get_last_run_time(self):
for doc in self.db.collection(u'executions').order_by(u'timestamp', direction=firestore.Query.DESCENDING).limit(1).stream():
return doc.to_dict()[u'timestamp']
25 changes: 8 additions & 17 deletions flathunter/idmaintainer.py
@@ -26,12 +26,10 @@ class AlreadySeenFilter:

def __init__(self, id_watch):
self.id_watch = id_watch
self.processed = self.id_watch.get()

def is_interesting(self, expose):
if expose['id'] not in self.processed:
if not self.id_watch.is_processed(expose['id']):
self.id_watch.mark_processed(expose['id'])
self.processed.append(expose['id'])
return True
return False

@@ -59,6 +57,13 @@ def get_connection(self):
raise e
return connection

def is_processed(self, expose_id):
self.__log__.debug('is_processed(' + str(expose_id) + ')')
cur = self.get_connection().cursor()
cur.execute('SELECT id FROM processed WHERE id = ?', (expose_id,))
row = cur.fetchone()
return (row is not None)

def mark_processed(self, expose_id):
self.__log__.debug('mark_processed(' + str(expose_id) + ')')
cur = self.get_connection().cursor()
@@ -111,20 +116,6 @@ def get_user_filters(self):
res.append((row[0], json.loads(row[1])['filters']))
return res

def get(self):
res = []
cur = self.get_connection().cursor()
cur.execute("SELECT * FROM processed ORDER BY 1")
while True:
row = cur.fetchone()
if row == None:
break
res.append(row[0])

self.__log__.info('already processed: ' + str(len(res)))
self.__log__.debug(str(res))
return res

def get_last_run_time(self):
cur = self.get_connection().cursor()
cur.execute("SELECT * FROM executions ORDER BY timestamp DESC LIMIT 1")
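
The efficiency gain named in the commit message comes from replacing the removed get() method, which loaded every processed id into memory when the filter was built, with a per-id is_processed lookup in both the SQLite and the Firestore maintainer. A standalone sketch of the SQLite variant against an in-memory database (the single-column table layout is assumed from the queries above):

import sqlite3

connection = sqlite3.connect(':memory:')
connection.execute('CREATE TABLE processed (id INTEGER PRIMARY KEY)')
connection.execute('INSERT INTO processed (id) VALUES (?)', (12345,))

def is_processed(expose_id):
    # Single indexed lookup instead of scanning the whole table up front
    cur = connection.cursor()
    cur.execute('SELECT id FROM processed WHERE id = ?', (expose_id,))
    return cur.fetchone() is not None

print(is_processed(12345))  # True
print(is_processed(99999))  # False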
7 changes: 6 additions & 1 deletion flathunter/processor.py
@@ -5,6 +5,7 @@
from flathunter.default_processors import AddressResolver
from flathunter.default_processors import Filter
from flathunter.default_processors import LambdaProcessor
from flathunter.default_processors import CrawlExposeDetails
from flathunter.sender_telegram import SenderTelegram
from flathunter.gmaps_duration_processor import GMapsDurationProcessor
from flathunter.idmaintainer import SaveAllExposesProcessor
@@ -30,6 +31,10 @@ def calculate_durations(self):
self.processors.append(GMapsDurationProcessor(self.config))
return self

def crawl_expose_details(self):
self.processors.append(CrawlExposeDetails(self.config))
return self

def map(self, func):
self.processors.append(LambdaProcessor(self.config, func))
return self
@@ -55,4 +60,4 @@ def process(self, exposes):

@staticmethod
def builder(config):
return ProcessorChainBuilder(config)
return ProcessorChainBuilder(config)
4 changes: 2 additions & 2 deletions flathunter/web/templates/exposes.html
@@ -1,7 +1,7 @@
<div class="exposes">
{% for expose in exposes %}
<div class="expose">
<p>{{ expose['price'] }}, {{expose['rooms']}} rooms, {{expose['size']}}</p>
<p>{{ expose['price'] }}, {{expose['rooms']}} rooms, {{expose['size']}} from {{expose['from']}}</p>
<a href="{{ expose['url'] }}" target="_blank">
{% if expose['image'] %}
<img src="{{ expose['image'] }}">
@@ -12,4 +12,4 @@
<h3><a href="{{ expose['url'] }}" target="_blank">{{ expose['title'] }}</a></h3>
</div>
{% endfor %}
</div>
</div>
5 changes: 3 additions & 2 deletions flathunter/web_hunter.py
@@ -14,8 +14,9 @@ def hunt_flats(self):
.build()

processor_chain = ProcessorChain.builder(self.config) \
.save_all_exposes(self.id_watch) \
.apply_filter(filter) \
.crawl_expose_details() \
.save_all_exposes(self.id_watch) \
.resolve_addresses() \
.calculate_durations() \
.build()
@@ -44,4 +45,4 @@ def set_filters_for_user(self, user_id, filters):
self.id_watch.set_filters_for_user(user_id, filters)

def get_filters_for_user(self, user_id):
return self.id_watch.get_filters_for_user(user_id)
return self.id_watch.get_filters_for_user(user_id)