backporting UnicodeDecodeError protection
Signed-off-by: Samuel Progin <[email protected]>
Arduous committed on Aug 31, 2018 · commit c9cb167 · 1 parent 1b5dc2e
Showing 1 changed file with 22 additions and 22 deletions.
44 changes: 22 additions & 22 deletions noisy.py
@@ -8,7 +8,6 @@
 import time
 
 import requests
-from urllib3.exceptions import LocationParseError
 
 try: # Python 2
     from urllib.parse import urljoin, urlparse
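
The context lines above show the import fallback noisy.py uses to run on both interpreter generations. One caveat worth flagging: urllib.parse only exists on Python 3, so the try branch actually succeeds on Python 3 and the ImportError branch covers Python 2; the inline comments in the upstream file appear to be swapped. A minimal sketch of the pattern as it behaves in practice:

try:
    from urllib.parse import urljoin, urlparse  # succeeds on Python 3
except ImportError:
    from urlparse import urljoin, urlparse      # reached on Python 2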
@@ -17,10 +16,11 @@
 
 try: # Python 2
     reload(sys)
-    sys.setdefaultencoding('utf-8')
+    sys.setdefaultencoding('latin-1')
 except NameError: # Python 3
     pass
 
+
 class Crawler(object):
     def __init__(self):
         """
@@ -101,7 +101,10 @@ def _is_blacklisted(self, url):
         :param url: full URL
         :return: boolean indicating whether a URL is blacklisted or not
         """
-        return any(blacklisted_url in url for blacklisted_url in self._config["blacklisted_urls"])
+        try:
+            return any(blacklisted_url in url for blacklisted_url in self._config["blacklisted_urls"])
+        except UnicodeDecodeError:
+            return True
 
     def _should_accept_url(self, url):
         """
@@ -229,25 +232,22 @@ def crawl(self):
         self._start_time = datetime.datetime.now()
 
         while True:
-            url = random.choice(self._config["root_urls"])
-            try:
-                body = self._request(url).content
-                self._links = self._extract_urls(body, url)
-                logging.debug("found {} links".format(len(self._links)))
-                self._browse_from_links()
-
-            except requests.exceptions.RequestException:
-                logging.warn("Error connecting to root url: {}".format(url))
-
-            except MemoryError:
-                logging.warn("Error: content at url: {} is exhausting the memory".format(url))
-
-            except LocationParseError:
-                logging.warn("Error encountered during parsing of: {}".format(url))
-
-            except self.CrawlerTimedOut:
-                logging.info("Timeout has exceeded, exiting")
-                return
+            for url in self._config["root_urls"]:
+                try:
+                    body = self._request(url).content
+                    self._links = self._extract_urls(body, url)
+                    logging.debug("found {} links".format(len(self._links)))
+                    self._browse_from_links()
+
+                except requests.exceptions.RequestException:
+                    logging.warn("Error connecting to root url: {}".format(url))
+
+                except self.CrawlerTimedOut:
+                    logging.info("Timeout has exceeded, exiting")
+                    return
+
+            logging.debug("No more links were found")
+
 
 def main():
     parser = argparse.ArgumentParser()
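
Beyond the exception handling, the crawl loop itself changes shape: the old code drew a random root URL forever, while the backported version sweeps every configured root once per pass and logs when a pass finishes; the MemoryError and LocationParseError handlers disappear along with the urllib3 import removed in the first hunk. A self-contained sketch of the two scheduling behaviours, with a made-up root_urls list:

import random

root_urls = ['https://a.example', 'https://b.example', 'https://c.example']

def pick_old():
    # old: endless random sampling; a given root may go unvisited for a while
    while True:
        yield random.choice(root_urls)

def pick_new():
    # new: every root exactly once per pass, in config order
    while True:
        for url in root_urls:
            yield url
        # the real code logs "No more links were found" here

gen = pick_new()
print([next(gen) for _ in range(4)])  # a, b, c, then a again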
