Skip to content

Commit

Permalink
Merge pull request NodyHub#11 from codders/feat/processor-pipelines
Browse files Browse the repository at this point in the history
Add processing pipelines
  • Loading branch information
codders authored Jun 9, 2020
2 parents 3ca75c3 + 959fe66 commit 66101c1
Show file tree
Hide file tree
Showing 17 changed files with 408 additions and 155 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Flask = "*"
Flask-API = "*"
firebase-admin = "*"
mock-firestore = "*"
pytest-mock = "*"
pytest = "*"

[dev-packages]
Expand Down
7 changes: 1 addition & 6 deletions flathunter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@
import logging
import time
import yaml
from flathunter.crawl_ebaykleinanzeigen import CrawlEbayKleinanzeigen
from flathunter.crawl_immobilienscout import CrawlImmobilienscout
from flathunter.crawl_wggesucht import CrawlWgGesucht
from flathunter.crawl_immowelt import CrawlImmowelt
from flathunter.idmaintainer import IdMaintainer
from flathunter.hunter import Hunter
from flathunter.config import Config
Expand Down Expand Up @@ -39,10 +35,9 @@


def launch_flat_hunt(config):
searchers = [CrawlImmobilienscout(), CrawlWgGesucht(), CrawlEbayKleinanzeigen(), CrawlImmowelt()]
id_watch = IdMaintainer('%s/processed_ids.db' % os.path.dirname(os.path.abspath(__file__)))

hunter = Hunter(config, searchers, id_watch)
hunter = Hunter(config, id_watch)
hunter.hunt_flats()

while config.get('loop', dict()).get('active', False):
Expand Down
4 changes: 4 additions & 0 deletions flathunter/abstract_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
class Processor:

def process_exposes(self, exposes):
return map(lambda e: self.process_expose(e), exposes)
13 changes: 13 additions & 0 deletions flathunter/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@
import yaml
import logging

from flathunter.crawl_ebaykleinanzeigen import CrawlEbayKleinanzeigen
from flathunter.crawl_immobilienscout import CrawlImmobilienscout
from flathunter.crawl_wggesucht import CrawlWgGesucht
from flathunter.crawl_immowelt import CrawlImmowelt
from flathunter.filter import Filter

class Config:

__log__ = logging.getLogger(__name__)
__searchers__ = [CrawlImmobilienscout(), CrawlWgGesucht(), CrawlEbayKleinanzeigen(), CrawlImmowelt()]

def __init__(self, filename=None, string=None):
if string is not None:
Expand All @@ -27,6 +32,14 @@ def __getitem__(self, value):
def get(self, key, value=None):
return self.config.get(key, value)

@staticmethod
def set_searchers(searchers):
Config.__searchers__ = searchers

@staticmethod
def searchers():
return Config.__searchers__

def get_filter(self):
builder = Filter.builder()
if "excluded_titles" in self.config:
Expand Down
37 changes: 37 additions & 0 deletions flathunter/default_processors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import re
import logging
from flathunter.abstract_processor import Processor

class Filter(Processor):

def __init__(self, config, filter):
self.config = config
self.filter = filter

def process_exposes(self, exposes):
return self.filter.filter(exposes)

class AddressResolver(Processor):
__log__ = logging.getLogger(__name__)

def __init__(self, config):
self.config = config

def process_expose(self, expose):
if expose['address'].startswith('http'):
url = expose['address']
for searcher in self.config.searchers():
if re.search(searcher.URL_PATTERN, url):
expose['address'] = searcher.load_address(url)
self.__log__.debug("Loaded address %s for url %s" % (expose['address'], url))
break
return expose

class LambdaProcessor(Processor):

def __init__(self, config, func):
self.func = func

def process_expose(self, expose):
res = self.func(expose)
return expose
53 changes: 29 additions & 24 deletions flathunter/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,37 +103,42 @@ def is_interesting(self, expose):
return True
return False

class FilterBuilder:

def __init__(self):
self.filters = []
class PredicateFilter:

def title_filter(self, filtered_titles):
self.filters.append(TitleFilter(filtered_titles))
return self

def min_price_filter(self, min_price):
self.filters.append(MinPriceFilter(min_price))
return self
def __init__(self, predicate):
self.predicate = predicate

def max_price_filter(self, max_price):
self.filters.append(MaxPriceFilter(max_price))
return self
def is_interesting(self, expose):
return self.predicate(expose)

def min_size_filter(self, min_size):
self.filters.append(MinSizeFilter(min_size))
return self
class FilterBuilder:

def max_size_filter(self, max_size):
self.filters.append(MaxSizeFilter(max_size))
return self
def __init__(self):
self.filters = []

def min_rooms_filter(self, min_rooms):
self.filters.append(MinRoomsFilter(min_rooms))
def read_config(self, config):
if "excluded_titles" in config:
self.filters.append(TitleFilter(config["excluded_titles"]))
if "filters" in config:
filters_config = config["filters"]
if "excluded_titles" in filters_config:
self.filters.append(TitleFilter(filters_config["excluded_titles"]))
if "min_price" in filters_config:
self.filters.append(MinPriceFilter(filters_config["min_price"]))
if "max_price" in filters_config:
self.filters.append(MaxPriceFilter(filters_config["max_price"]))
if "min_size" in filters_config:
self.filters.append(MinSizeFilter(filters_config["min_size"]))
if "max_size" in filters_config:
self.filters.append(MaxSizeFilter(filters_config["max_size"]))
if "min_rooms" in filters_config:
self.filters.append(MinRoomsFilter(filters_config["min_rooms"]))
if "max_rooms" in filters_config:
self.filters.append(MaxRoomsFilter(filters_config["max_rooms"]))
return self

def max_rooms_filter(self, max_rooms):
self.filters.append(MaxRoomsFilter(max_rooms))
def predicate_filter(self, predicate):
self.filters.append(PredicateFilter(predicate))
return self

def build(self):
Expand Down
79 changes: 79 additions & 0 deletions flathunter/gmaps_duration_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import logging
import datetime
import time
import urllib
import requests

from flathunter.abstract_processor import Processor

class GMapsDurationProcessor(Processor):

GM_MODE_TRANSIT = 'transit'
GM_MODE_BICYCLE = 'bicycling'
GM_MODE_DRIVING = 'driving'

__log__ = logging.getLogger(__name__)

def __init__(self, config):
self.config = config

def process_expose(self, expose):
expose['durations'] = self.get_formatted_durations(expose['address']).strip()
return expose

def get_formatted_durations(self, address):
out = ""
for duration in self.config.get('durations', list()):
if 'destination' in duration and 'name' in duration:
dest = duration.get('destination')
name = duration.get('name')
for mode in duration.get('modes', list()):
if 'gm_id' in mode and 'title' in mode and 'key' in self.config.get('google_maps_api', dict()):
duration = self.get_gmaps_distance(address, dest, mode['gm_id'])
out += "> %s (%s): %s\n" % (name, mode['title'], duration)

return out.strip()

def get_gmaps_distance(self, address, dest, mode):
# get timestamp for next monday at 9:00:00 o'clock
now = datetime.datetime.today().replace(hour=9, minute=0, second=0)
next_monday = now + datetime.timedelta(days=(7 - now.weekday()))
arrival_time = str(int(time.mktime(next_monday.timetuple())))

# decode from unicode and url encode addresses
address = urllib.parse.quote_plus(address.strip().encode('utf8'))
dest = urllib.parse.quote_plus(dest.strip().encode('utf8'))
self.__log__.debug("Got address: %s" % address)

# get google maps config stuff
base_url = self.config.get('google_maps_api', dict()).get('url')
gm_key = self.config.get('google_maps_api', dict()).get('key')

if not gm_key and mode != self.GM_MODE_DRIVING:
self.__log__.warning("No Google Maps API key configured and without using a mode different from "
"'driving' is not allowed. Downgrading to mode 'drinving' thus. ")
mode = 'driving'
base_url = base_url.replace('&key={key}', '')

# retrieve the result
url = base_url.format(dest=dest, mode=mode, origin=address, key=gm_key, arrival=arrival_time)
result = requests.get(url).json()
if result['status'] != 'OK':
self.__log__.error("Failed retrieving distance to address %s: " % address, result)
return None

# get the fastest route
distances = dict()
for row in result['rows']:
for element in row['elements']:
if 'status' in element and element['status'] != 'OK':
self.__log__.warning("For address %s we got the status message: %s" % (address, element['status']))
self.__log__.debug("We got this result: %s" % repr(result))
continue
self.__log__.debug("Got distance and duration: %s / %s (%i seconds)"
% (element['distance']['text'], element['duration']['text'],
element['duration']['value'])
)
distances[element['duration']['value']] = '%s (%s)' % \
(element['duration']['text'], element['distance']['text'])
return distances[min(distances.keys())] if distances else None
Loading

0 comments on commit 66101c1

Please sign in to comment.