General fixes for immoscout and wggesucht (Oct. 2019). Add date filter. #11

Open · wants to merge 4 commits into master
60 changes: 40 additions & 20 deletions config.yaml.dist
@@ -8,10 +8,16 @@ loop:
# Currently supported services: www.immobilienscout24.de and
# www.wg-gesucht.de. List the URLs in the following format:
# urls:
# - https://www.immobilienscout24.de/Suche/...
# - https://www.wg-gesucht.de/...
# - "https://www.immobilienscout24.de/Suche/..."
# - "https://www.wg-gesucht.de/..."
urls:

# Addresses often contain city district names that Google Maps
# cannot resolve properly. Use this blacklist to remove such
# districts from the address.
blacklist:
- Innenstadt
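
As a rough illustration (not part of this diff), the blacklist above could be applied to a scraped address before it is handed to Google Maps. The helper below is hypothetical:

def strip_blacklisted_districts(address, blacklist):
    # remove each blacklisted district name from the address string
    for district in blacklist:
        address = address.replace(district, "")
    # collapse the extra whitespace that the replacement may leave behind
    return " ".join(address.split())

# strip_blacklisted_districts("Hauptstraße 5 Innenstadt München", ["Innenstadt"])
# -> "Hauptstraße 5 München"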

# If an expose includes an address, the bot is capable of
# displaying the distance and time to travel (duration) to
# some configured other addresses, for specific kinds of
@@ -22,28 +28,29 @@ urls:
# - "bicycle"
# - "transit" (public transport)
# - "driving"
# - "walking"
#
# The example configuration below includes a place for
# "John", located at the main train station of Munich.
# Two kinds of travel (bicycle and transit) are requested,
# each with a different label. Furthermore, a place for
# "Jane" is included, located at the given destination and
# with the same kinds of travel.
durations:
- name: John
destination: Hauptbahnhof, München
modes:
- gm_id: transit
title: "Öff."
- gm_id: bicycle
title: "Rad"
- name: Jane
destination: Karlsplatz, München
modes:
- gm_id: transit
title: "Öff."
- gm_id: driving
title: "Auto"
#durations:
# - name: John
# destination: Hauptbahnhof, München
# modes:
# - gm_id: transit
# title: "Öff."
# - gm_id: bicycle
# title: "Rad"
# - name: Jane
# destination: Karlsplatz, München
# modes:
# - gm_id: transit
# title: "Öff."
# - gm_id: driving
# title: "Auto"

# Multiline message (yes, the | is supposed to be there),
# to format the message received from the Telegram bot.
@@ -54,15 +61,18 @@ durations:
# - {price}: Price for the flat
# - {durations}: Durations calculated by GMaps, see above
# - {url}: URL to the expose
# - {address}: address of the flat
# - {date}: possible move-in date
message: |
{title}
{title} (ab {date})
Zimmer: {rooms}
Größe: {size}
Preis: {price}
Anfahrt:
{durations}
Adresse: {address}

{url}
# Anfahrt:
# {durations}

# Calculating durations requires access to the Google Maps API.
# Below you can configure the URL to access the API, with placeholders.
@@ -92,3 +102,13 @@ google_maps_api:
telegram:
bot_token:
receiver_ids:

# Entries can be filtered by their possible move-in date. Three filters are available:
# a minimum date, a maximum date and a blacklist of phrases. The blacklist is useful for
# date texts that cannot be parsed (e.g. "sofort").
#date_filter:
# date_min: 2019-12-01
# date_max: 2020-01-01
# blacklist_phrases:
# - "sofort"
date_filter:
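
The filtering code itself is not shown in this diff, so the following is only a sketch of how the date_filter options above might be applied. The function name is hypothetical, and the assumption that blacklisted phrases cause an entry to be rejected is mine; both crawlers yield move-in dates as DD.MM.YYYY strings or free text such as "sofort".

from datetime import datetime

def passes_date_filter(raw_date, date_min=None, date_max=None, blacklist_phrases=()):
    # Reject entries whose date text contains a blacklisted phrase (assumption).
    lowered = raw_date.lower()
    if any(phrase.lower() in lowered for phrase in blacklist_phrases):
        return False
    try:
        # Dates scraped by the crawlers look like "01.12.2019".
        move_in = datetime.strptime(raw_date.strip(), "%d.%m.%Y").date()
    except ValueError:
        # Keep entries whose date cannot be parsed; use blacklist_phrases to drop them explicitly.
        return True
    if date_min is not None and move_in < date_min:
        return False
    if date_max is not None and move_in > date_max:
        return False
    return True

With the example configuration (date_min of 2019-12-01), an entry dated "01.11.2019" would be rejected, and "sofort" would be dropped via the blacklist.
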
34 changes: 29 additions & 5 deletions flathunter/crawl_immobilienscout.py
@@ -12,9 +12,9 @@ def __init__(self):
def get_results(self, search_url):
# convert to paged URL
if '/P-' in search_url:
search_url = re.sub(r"/Suche/(.+?)/P-\d+", "/Suche/\1/P-%i", search_url)
search_url = re.sub(r"/Suche/(.+?)/P-\d+", r"/Suche/\1/P-[pageno]", search_url)
else:
search_url = re.sub(r"/Suche/(.+?)/", r"/Suche/\1/P-%i/", search_url)
search_url = re.sub(r"/Suche/(.+?)/", r"/Suche/\1/P-[pageno]/", search_url)
self.__log__.debug("Got search URL %s" % search_url)

# load first page to get number of entries
@@ -29,16 +29,28 @@ def get_results(self, search_url):
entries = self.extract_data(soup)

# iterate over all remaining pages
while len(entries) < no_of_results:
num_empty_pages = 0
num_entries = len(entries)
while num_entries < no_of_results and num_empty_pages < 5:
self.__log__.debug('Next Page')
page_no += 1
soup = self.get_page(search_url, page_no)
entries.extend(self.extract_data(soup))
new_entries = self.extract_data(soup)
num_entries += len(new_entries)

if len(new_entries) == 0:
num_empty_pages += 1

entries.extend(new_entries)

return entries

def get_page(self, search_url, page_no):
resp = requests.get(search_url % page_no)
url = search_url.replace("[pageno]", str(page_no), 1)
return self.get_generic_page(url)

def get_generic_page(self, url):
resp = requests.get(url)
if resp.status_code != 200:
self.__log__.error("Got response (%i): %s" % (resp.status_code, resp.content))
return BeautifulSoup(resp.content, 'html.parser')
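
To make the placeholder mechanism concrete, here is a small self-contained example of how "[pageno]" round-trips between get_results and get_page; the sample search URL is made up:

import re

search_url = "https://www.immobilienscout24.de/Suche/S-T/Wohnung-Miete/Bayern/Muenchen/"
# get_results injects the placeholder once ...
paged_url = re.sub(r"/Suche/(.+?)/", r"/Suche/\1/P-[pageno]/", search_url)
# ... and get_page swaps it for a concrete page number on each request
url = paged_url.replace("[pageno]", str(3), 1)
# url == "https://www.immobilienscout24.de/Suche/S-T/P-3/Wohnung-Miete/Bayern/Muenchen/"
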
@@ -68,3 +80,15 @@ def extract_data(self, soup):

self.__log__.debug('extracted: ' + str(entries))
return entries

def load_date(self, url):
# extract the move-in date ("bezugsfrei ab") from the expose page itself
soup = self.get_generic_page(url)

bezugsfrei_elements = soup.find_all(lambda e: e.has_attr("class") and "is24qa-bezugsfrei-ab" in e["class"])
bezugsfrei_date = "?"
if bezugsfrei_elements:
bezugsfrei_date = bezugsfrei_elements[0].text.strip()

return bezugsfrei_date
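
As a quick, self-contained illustration of what load_date looks for, the snippet below runs the same find_all lambda against a made-up HTML fragment; the surrounding tag and extra classes are assumptions, only the is24qa-bezugsfrei-ab class comes from the code above:

from bs4 import BeautifulSoup

html = '<dd class="is24qa-bezugsfrei-ab grid-item three-fifths"> 01.12.2019 </dd>'
soup = BeautifulSoup(html, 'html.parser')
elements = soup.find_all(lambda e: e.has_attr("class") and "is24qa-bezugsfrei-ab" in e["class"])
print(elements[0].text.strip())  # 01.12.2019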

72 changes: 41 additions & 31 deletions flathunter/crawl_wggesucht.py
@@ -15,60 +15,69 @@ def get_results(self, search_url):
self.__log__.debug("Got search URL %s" % search_url)

# load first page
page_no = 0
soup = self.get_page(search_url, page_no)
no_of_pages = 0 # TODO get it from soup
self.__log__.info('Found pages: ' + str(no_of_pages))
soup = self.get_page(search_url)

# extract additional pages
page_urls = []
a_paginations = soup.find_all("a", class_="a-pagination")
for a_pagination in a_paginations:
# for each additional page
page_urls.append("https://www.wg-gesucht.de/" + a_pagination.get('href'))

self.__log__.info('Found pages: ' + str(len(page_urls)+1))

# get data from first page
entries = self.extract_data(soup)
self.__log__.debug('Number of found entries: ' + str(len(entries)))

# iterate over all remaining pages
while (page_no + 1) < no_of_pages: # page_no starts with 0, no_of_pages with 1
page_no += 1
self.__log__.debug('Checking page %i' % page_no)
soup = self.get_page(search_url, page_no)
current_page_no = 2
for page_url in page_urls:
self.__log__.debug('Checking page %i' % current_page_no)
soup = self.get_page(page_url)
entries.extend(self.extract_data(soup))
self.__log__.debug('Number of found entries: ' + str(len(entries)))
current_page_no += 1

return entries

def get_page(self, search_url, page_no):
resp = requests.get(search_url) # TODO add page_no in url
def get_page(self, search_url):
# search_url must be specific page - cannot add page number manually
resp = requests.get(search_url)
if resp.status_code != 200:
self.__log__.error("Got response (%i): %s" % (resp.status_code, resp.content))
return BeautifulSoup(resp.content, 'html.parser')
return BeautifulSoup(resp.content, 'lxml')

def extract_data(self, soup):
entries = []

findings = soup.find_all(lambda e: e.has_attr('id') and e['id'].startswith('ad--'))
findings = soup.find_all(lambda e: e.has_attr('id') and e['id'].startswith('liste-'))
existingFindings = list(
filter(lambda e: e.has_attr('class') and not 'listenansicht-inactive' in e['class'], findings))
filter(lambda e: e.has_attr('class') and not 'display-none' in e['class'], findings))

baseurl = 'https://www.wg-gesucht.de/'
for row in existingFindings:
url = baseurl + row['adid'] # u'wohnungen-in-Muenchen-Altstadt-Lehel.6038357.html'
id = int(url.split('.')[-2])
rooms = row.find(lambda e: e.has_attr('class') and 'ang_spalte_zimmer' in e['class']).text.strip() # u'3'
price = row.find(
lambda e: e.has_attr('class') and 'ang_spalte_miete' in e['class']).text.strip() # u'433\u20ac'
size = row.find(
lambda e: e.has_attr('class') and 'ang_spalte_groesse' in e['class']).text.strip() # u'75m\xb2'
district = row.find(
lambda e: e.has_attr('class') and 'ang_spalte_stadt' in e['class']).text.strip() # u'Altstadt-Lehel'
date = row.find(
lambda e: e.has_attr('class') and 'ang_spalte_freiab' in e['class']).text.strip() # u'21.03.17'
infostring = row.find(
lambda e: e.name == "div" and e.has_attr('class') and 'list-details-panel-inner' in e[
'class']).p.text.strip()
rooms = "1?" # re.findall(r'\d[-]Zimmer[-]Wohnung', infostring)[0][:1]
date = re.findall(r'\d{2}.\d{2}.\d{4}', infostring)[0]
detail = row.find_all(lambda e: e.name == "a" and e.has_attr('class') and 'detailansicht' in e['class'])
title = detail[2].text.strip()
url = baseurl + detail[0]["href"]
size_price = detail[0].text.strip()
price = re.findall(r'\d{2,4}\s€', size_price)[0]
size = re.findall(r'\d{2,4}\sm²', size_price)[0]

details = {
'id': int(url.split('.')[-2]),
'url': url,
'title': "Wohnung in %s ab dem %s" % (district, date),
'title': title,
'price': price,
'size': size,
'rooms': rooms + " Zi.",
'address': url
'address': url,
'date': date,
}
entries.append(details)

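For clarity, a small worked example of the regular expressions used above, run on made-up sample strings:

import re

size_price = "480 €  22 m²"              # sample of detail[0].text.strip()
infostring = "2er WG ab dem 01.12.2019"  # sample of the details panel text

price = re.findall(r'\d{2,4}\s€', size_price)[0]        # '480 €'
size = re.findall(r'\d{2,4}\sm²', size_price)[0]        # '22 m²'
date = re.findall(r'\d{2}.\d{2}.\d{4}', infostring)[0]  # '01.12.2019'
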
@@ -78,9 +87,10 @@ def extract_data(self, soup):

def load_address(self, url):
# extract address from expose itself
exposeHTML = requests.get(url).content
exposeSoup = BeautifulSoup(exposeHTML, 'html.parser')
address_raw = exposeSoup.find(lambda e: e.has_attr('onclick') and '#map_tab' in e['onclick']).text
address = address_raw.strip().split('\n')[0] + ", " + address_raw.strip().split('\n')[-1].strip()

r = requests.get(url)
flat = BeautifulSoup(r.content, 'lxml')
try:
address = ' '.join(flat.find('div', {"class": "col-sm-4 mb10"}).find("a", {"href": "#"}).text.strip().split())
except AttributeError:
address = "?"
return address