General fixes for immoscout and wggesucht (Oct. 2019). Add date filter. #11

Open · wants to merge 4 commits into master
60 changes: 40 additions & 20 deletions config.yaml.dist
@@ -8,10 +8,16 @@ loop:
# Currently supported services: www.immobilienscout24.de and
# www.wg-gesucht.de. List the URLs in the following format:
# urls:
# - https://www.immobilienscout24.de/Suche/...
# - https://www.wg-gesucht.de/...
# - "https://www.immobilienscout24.de/Suche/..."
# - "https://www.wg-gesucht.de/..."
urls:

# Addresses often contain city district names that Google Maps
# cannot resolve properly. Use this blacklist to remove such
# districts from the address.
blacklist:
- Innenstadt
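
As a rough illustration (not part of this diff), the blacklist above could be applied to a scraped address before it is handed to Google Maps. The helper below is hypothetical:

def strip_blacklisted_districts(address, blacklist):
    # remove each blacklisted district name from the address string
    for district in blacklist:
        address = address.replace(district, "")
    # collapse the extra whitespace that the replacement may leave behind
    return " ".join(address.split())

# strip_blacklisted_districts("Hauptstraße 5 Innenstadt München", ["Innenstadt"])
# -> "Hauptstraße 5 München"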

# If an expose includes an address, the bot is capable of
# displaying the distance and time to travel (duration) to
# some configured other addresses, for specific kinds of
@@ -22,28 +28,29 @@ urls:
# - "bicycle"
# - "transit" (public transport)
# - "driving"
# - "walking"
#
# The example configuration below includes a place for
# "John", located at the main train station of Munich.
# Two kinds of travel (bicycle and transit) are requested,
# each with a different label. Furthermore, a place for
# "Jane" is included, located at the given destination and
# with the same kinds of travel.
durations:
- name: John
destination: Hauptbahnhof, München
modes:
- gm_id: transit
title: "Öff."
- gm_id: bicycle
title: "Rad"
- name: Jane
destination: Karlsplatz, München
modes:
- gm_id: transit
title: "Öff."
- gm_id: driving
title: "Auto"
#durations:
# - name: John
# destination: Hauptbahnhof, München
# modes:
# - gm_id: transit
# title: "Öff."
# - gm_id: bicycle
# title: "Rad"
# - name: Jane
# destination: Karlsplatz, München
# modes:
# - gm_id: transit
# title: "Öff."
# - gm_id: driving
# title: "Auto"

# Multiline message (yes, the | is supposed to be there),
# to format the message received from the Telegram bot.
@@ -54,15 +61,18 @@ durations:
# - {price}: Price for the flat
# - {durations}: Durations calculated by GMaps, see above
# - {url}: URL to the expose
# - {address}: address of the flat
# - {date}: possible move-in date
message: |
{title}
{title} (ab {date})
Zimmer: {rooms}
Größe: {size}
Preis: {price}
Anfahrt:
{durations}
Adresse: {address}

{url}
# Anfahrt:
# {durations}

# Calculating durations requires access to the Google Maps API.
# Below you can configure the URL to access the API, with placeholders.
@@ -92,3 +102,13 @@ google_maps_api:
telegram:
bot_token:
receiver_ids:

# Entries can be filtered by their possible move-in date. Three filters are available:
# a minimum date, a maximum date and a blacklist of phrases. The blacklist is useful for
# date texts that cannot be parsed (e.g. "sofort").
#date_filter:
# date_min: 2019-12-01
# date_max: 2020-01-01
# blacklist_phrases:
# - "sofort"
date_filter:
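
The filtering code itself is not shown in this diff, so the following is only a sketch of how the date_filter options above might be applied. The function name is hypothetical, and the assumption that blacklisted phrases cause an entry to be rejected is mine; both crawlers yield move-in dates as DD.MM.YYYY strings or free text such as "sofort".

from datetime import datetime

def passes_date_filter(raw_date, date_min=None, date_max=None, blacklist_phrases=()):
    # Reject entries whose date text contains a blacklisted phrase (assumption).
    lowered = raw_date.lower()
    if any(phrase.lower() in lowered for phrase in blacklist_phrases):
        return False
    try:
        # Dates scraped by the crawlers look like "01.12.2019".
        move_in = datetime.strptime(raw_date.strip(), "%d.%m.%Y").date()
    except ValueError:
        # Keep entries whose date cannot be parsed; use blacklist_phrases to drop them explicitly.
        return True
    if date_min is not None and move_in < date_min:
        return False
    if date_max is not None and move_in > date_max:
        return False
    return True

With the example configuration (date_min of 2019-12-01), an entry dated "01.11.2019" would be rejected, and "sofort" would be dropped via the blacklist.
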
34 changes: 29 additions & 5 deletions flathunter/crawl_immobilienscout.py
@@ -12,9 +12,9 @@ def __init__(self):
def get_results(self, search_url):
# convert to paged URL
if '/P-' in search_url:
search_url = re.sub(r"/Suche/(.+?)/P-\d+", "/Suche/\1/P-%i", search_url)
search_url = re.sub(r"/Suche/(.+?)/P-\d+", r"/Suche/\1/P-[pageno]", search_url)
else:
search_url = re.sub(r"/Suche/(.+?)/", r"/Suche/\1/P-%i/", search_url)
search_url = re.sub(r"/Suche/(.+?)/", r"/Suche/\1/P-[pageno]/", search_url)
self.__log__.debug("Got search URL %s" % search_url)

# load first page to get number of entries
@@ -29,16 +29,28 @@ def get_results(self, search_url):
entries = self.extract_data(soup)

# iterate over all remaining pages
while len(entries) < no_of_results:
num_empty_pages = 0
num_entries = len(entries)
while num_entries < no_of_results and num_empty_pages < 5:
self.__log__.debug('Next Page')
page_no += 1
soup = self.get_page(search_url, page_no)
entries.extend(self.extract_data(soup))
new_entries = self.extract_data(soup)
num_entries += len(new_entries)

if len(new_entries) == 0:
num_empty_pages += 1

entries.extend(new_entries)

return entries

def get_page(self, search_url, page_no):
resp = requests.get(search_url % page_no)
url = search_url.replace("[pageno]", str(page_no), 1)
return self.get_generic_page(url)

def get_generic_page(self, url):
resp = requests.get(url)
if resp.status_code != 200:
self.__log__.error("Got response (%i): %s" % (resp.status_code, resp.content))
return BeautifulSoup(resp.content, 'html.parser')
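
To make the placeholder mechanism concrete, here is a small self-contained example of how "[pageno]" round-trips between get_results and get_page; the sample search URL is made up:

import re

search_url = "https://www.immobilienscout24.de/Suche/S-T/Wohnung-Miete/Bayern/Muenchen/"
# get_results injects the placeholder once ...
paged_url = re.sub(r"/Suche/(.+?)/", r"/Suche/\1/P-[pageno]/", search_url)
# ... and get_page swaps it for a concrete page number on each request
url = paged_url.replace("[pageno]", str(3), 1)
# url == "https://www.immobilienscout24.de/Suche/S-T/P-3/Wohnung-Miete/Bayern/Muenchen/"
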
@@ -68,3 +80,15 @@ def extract_data(self, soup):

self.__log__.debug('extracted: ' + str(entries))
return entries

def load_date(self, url):
# extract the move-in date ("bezugsfrei ab") from the expose page itself
soup = self.get_generic_page(url)

bezugsfrei_elements = soup.find_all(lambda e: e.has_attr("class") and "is24qa-bezugsfrei-ab" in e["class"])
bezugsfrei_date = "?"
if bezugsfrei_elements:
bezugsfrei_date = bezugsfrei_elements[0].text.strip()

return bezugsfrei_date
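
As a quick, self-contained illustration of what load_date looks for, the snippet below runs the same find_all lambda against a made-up HTML fragment; the surrounding tag and extra classes are assumptions, only the is24qa-bezugsfrei-ab class comes from the code above:

from bs4 import BeautifulSoup

html = '<dd class="is24qa-bezugsfrei-ab grid-item three-fifths"> 01.12.2019 </dd>'
soup = BeautifulSoup(html, 'html.parser')
elements = soup.find_all(lambda e: e.has_attr("class") and "is24qa-bezugsfrei-ab" in e["class"])
print(elements[0].text.strip())  # 01.12.2019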

72 changes: 41 additions & 31 deletions flathunter/crawl_wggesucht.py
@@ -15,60 +15,69 @@ def get_results(self, search_url):
self.__log__.debug("Got search URL %s" % search_url)

# load first page
page_no = 0
soup = self.get_page(search_url, page_no)
no_of_pages = 0 # TODO get it from soup
self.__log__.info('Found pages: ' + str(no_of_pages))
soup = self.get_page(search_url)

# extract additional pages
page_urls = []
a_paginations = soup.find_all("a", class_="a-pagination")
for a_pagination in a_paginations:
# for each additional page
page_urls.append("https://www.wg-gesucht.de/" + a_pagination.get('href'))

self.__log__.info('Found pages: ' + str(len(page_urls)+1))

# get data from first page
entries = self.extract_data(soup)
self.__log__.debug('Number of found entries: ' + str(len(entries)))

# iterate over all remaining pages
while (page_no + 1) < no_of_pages: # page_no starts with 0, no_of_pages with 1
page_no += 1
self.__log__.debug('Checking page %i' % page_no)
soup = self.get_page(search_url, page_no)
current_page_no = 2
for page_url in page_urls:
self.__log__.debug('Checking page %i' % current_page_no)
soup = self.get_page(page_url)
entries.extend(self.extract_data(soup))
self.__log__.debug('Number of found entries: ' + str(len(entries)))
current_page_no += 1

return entries

def get_page(self, search_url, page_no):
resp = requests.get(search_url) # TODO add page_no in url
def get_page(self, search_url):
# search_url must be specific page - cannot add page number manually
resp = requests.get(search_url)
if resp.status_code != 200:
self.__log__.error("Got response (%i): %s" % (resp.status_code, resp.content))
return BeautifulSoup(resp.content, 'html.parser')
return BeautifulSoup(resp.content, 'lxml')

def extract_data(self, soup):
entries = []

findings = soup.find_all(lambda e: e.has_attr('id') and e['id'].startswith('ad--'))
findings = soup.find_all(lambda e: e.has_attr('id') and e['id'].startswith('liste-'))
existingFindings = list(
filter(lambda e: e.has_attr('class') and not 'listenansicht-inactive' in e['class'], findings))
filter(lambda e: e.has_attr('class') and not 'display-none' in e['class'], findings))

baseurl = 'https://www.wg-gesucht.de/'
for row in existingFindings:
url = baseurl + row['adid'] # u'wohnungen-in-Muenchen-Altstadt-Lehel.6038357.html'
id = int(url.split('.')[-2])
rooms = row.find(lambda e: e.has_attr('class') and 'ang_spalte_zimmer' in e['class']).text.strip() # u'3'
price = row.find(
lambda e: e.has_attr('class') and 'ang_spalte_miete' in e['class']).text.strip() # u'433\u20ac'
size = row.find(
lambda e: e.has_attr('class') and 'ang_spalte_groesse' in e['class']).text.strip() # u'75m\xb2'
district = row.find(
lambda e: e.has_attr('class') and 'ang_spalte_stadt' in e['class']).text.strip() # u'Altstadt-Lehel'
date = row.find(
lambda e: e.has_attr('class') and 'ang_spalte_freiab' in e['class']).text.strip() # u'21.03.17'
infostring = row.find(
lambda e: e.name == "div" and e.has_attr('class') and 'list-details-panel-inner' in e[
'class']).p.text.strip()
rooms = "1?" # re.findall(r'\d[-]Zimmer[-]Wohnung', infostring)[0][:1]
date = re.findall(r'\d{2}.\d{2}.\d{4}', infostring)[0]
detail = row.find_all(lambda e: e.name == "a" and e.has_attr('class') and 'detailansicht' in e['class'])
title = detail[2].text.strip()
url = baseurl + detail[0]["href"]
size_price = detail[0].text.strip()
price = re.findall(r'\d{2,4}\s€', size_price)[0]
size = re.findall(r'\d{2,4}\sm²', size_price)[0]

details = {
'id': int(url.split('.')[-2]),
'url': url,
'title': "Wohnung in %s ab dem %s" % (district, date),
'title': title,
'price': price,
'size': size,
'rooms': rooms + " Zi.",
'address': url
'address': url,
'date': date,
}
entries.append(details)

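For clarity, a small worked example of the regular expressions used above, run on made-up sample strings:

import re

size_price = "480 €  22 m²"              # sample of detail[0].text.strip()
infostring = "2er WG ab dem 01.12.2019"  # sample of the details panel text

price = re.findall(r'\d{2,4}\s€', size_price)[0]        # '480 €'
size = re.findall(r'\d{2,4}\sm²', size_price)[0]        # '22 m²'
date = re.findall(r'\d{2}.\d{2}.\d{4}', infostring)[0]  # '01.12.2019'
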
@@ -78,9 +87,10 @@ def extract_data(self, soup):

def load_address(self, url):
# extract address from expose itself
exposeHTML = requests.get(url).content
exposeSoup = BeautifulSoup(exposeHTML, 'html.parser')
address_raw = exposeSoup.find(lambda e: e.has_attr('onclick') and '#map_tab' in e['onclick']).text
address = address_raw.strip().split('\n')[0] + ", " + address_raw.strip().split('\n')[-1].strip()

r = requests.get(url)
flat = BeautifulSoup(r.content, 'lxml')
try:
address = ' '.join(flat.find('div', {"class": "col-sm-4 mb10"}).find("a", {"href": "#"}).text.strip().split())
except AttributeError:
address = "?"
return address