backporting UnicodeDecodeError protection
Signed-off-by: Samuel Progin <[email protected]>
Arduous committed on Aug 31, 2018 · commit c9cb167 · 1 parent 1b5dc2e
Showing 1 changed file with 22 additions and 22 deletions.
44 changes: 22 additions & 22 deletions noisy.py
@@ -8,7 +8,6 @@
 import time
 
 import requests
-from urllib3.exceptions import LocationParseError
 
 try: # Python 2
     from urllib.parse import urljoin, urlparse
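
The context lines above show the import fallback noisy.py uses to run on both interpreter generations. One caveat worth flagging: urllib.parse only exists on Python 3, so the try branch actually succeeds on Python 3 and the ImportError branch covers Python 2; the inline comments in the upstream file appear to be swapped. A minimal sketch of the pattern as it behaves in practice:

try:
    from urllib.parse import urljoin, urlparse  # succeeds on Python 3
except ImportError:
    from urlparse import urljoin, urlparse      # reached on Python 2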
@@ -17,10 +16,11 @@
 
 try: # Python 2
     reload(sys)
-    sys.setdefaultencoding('utf-8')
+    sys.setdefaultencoding('latin-1')
 except NameError: # Python 3
     pass
 
+
 class Crawler(object):
     def __init__(self):
         """
@@ -101,7 +101,10 @@ def _is_blacklisted(self, url):
         :param url: full URL
         :return: boolean indicating whether a URL is blacklisted or not
         """
-        return any(blacklisted_url in url for blacklisted_url in self._config["blacklisted_urls"])
+        try:
+            return any(blacklisted_url in url for blacklisted_url in self._config["blacklisted_urls"])
+        except UnicodeDecodeError:
+            return True
 
     def _should_accept_url(self, url):
         """
@@ -229,25 +232,22 @@ def crawl(self):
         self._start_time = datetime.datetime.now()
 
         while True:
-            url = random.choice(self._config["root_urls"])
-            try:
-                body = self._request(url).content
-                self._links = self._extract_urls(body, url)
-                logging.debug("found {} links".format(len(self._links)))
-                self._browse_from_links()
-
-            except requests.exceptions.RequestException:
-                logging.warn("Error connecting to root url: {}".format(url))
-
-            except MemoryError:
-                logging.warn("Error: content at url: {} is exhausting the memory".format(url))
-
-            except LocationParseError:
-                logging.warn("Error encountered during parsing of: {}".format(url))
-
-            except self.CrawlerTimedOut:
-                logging.info("Timeout has exceeded, exiting")
-                return
+            for url in self._config["root_urls"]:
+                try:
+                    body = self._request(url).content
+                    self._links = self._extract_urls(body, url)
+                    logging.debug("found {} links".format(len(self._links)))
+                    self._browse_from_links()
+
+                except requests.exceptions.RequestException:
+                    logging.warn("Error connecting to root url: {}".format(url))
+
+                except self.CrawlerTimedOut:
+                    logging.info("Timeout has exceeded, exiting")
+                    return
+
+            logging.debug("No more links were found")
+
 
 def main():
     parser = argparse.ArgumentParser()
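
Beyond the exception handling, the crawl loop itself changes shape: the old code drew a random root URL forever, while the backported version sweeps every configured root once per pass and logs when a pass finishes; the MemoryError and LocationParseError handlers disappear along with the urllib3 import removed in the first hunk. A self-contained sketch of the two scheduling behaviours, with a made-up root_urls list:

import random

root_urls = ['https://a.example', 'https://b.example', 'https://c.example']

def pick_old():
    # old: endless random sampling; a given root may go unvisited for a while
    while True:
        yield random.choice(root_urls)

def pick_new():
    # new: every root exactly once per pass, in config order
    while True:
        for url in root_urls:
            yield url
        # the real code logs "No more links were found" here

gen = pick_new()
print([next(gen) for _ in range(4)])  # a, b, c, then a again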
