From b36fa5ba2147a05a3d6a85abc4c9b67b34cb88ea Mon Sep 17 00:00:00 2001 From: Andy Chosak Date: Mon, 6 Nov 2023 16:42:36 -0500 Subject: [PATCH] Better handling of wpull exit status codes Currently the crawl management command always returns a zero exit code even if wpull has some kind of serious error. By default wpull returns a non-zero exit code for *any* failure, which is too sensitive - we don't want downstream processing to fail just because the crawler can't resolve the DNS of a link, for example. This change reintroduces the wpull exit status codes but only for errors that don't relate to network failures (DNS, connectivity, etc). --- crawler/management/commands/crawl.py | 2 +- crawler/wpull_plugin.py | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/crawler/management/commands/crawl.py b/crawler/management/commands/crawl.py index 51d6666..717a4d4 100644 --- a/crawler/management/commands/crawl.py +++ b/crawler/management/commands/crawl.py @@ -79,4 +79,4 @@ def command(start_url, db_filename, max_pages, depth, recreate, resume): # https://docs.djangoproject.com/en/3.2/topics/async/#async-safety os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true" - app.run_sync() + return app.run_sync() diff --git a/crawler/wpull_plugin.py b/crawler/wpull_plugin.py index af85d08..2c932d3 100644 --- a/crawler/wpull_plugin.py +++ b/crawler/wpull_plugin.py @@ -9,6 +9,7 @@ from wpull.application.hook import Actions from wpull.application.plugin import PluginFunctions, WpullPlugin, hook +from wpull.errors import ExitStatus from wpull.network.connection import BaseConnection from wpull.pipeline.item import URLProperties from wpull.url import URLInfo @@ -271,3 +272,26 @@ def process_200_response(self, request, response): html = response.body.content().decode("utf-8") return Page.from_html(request.url, html, self.start_url.hostname) + + @hook(PluginFunctions.exit_status) + def exit_status(self, app_session, exit_code): + # If a non-zero exit code 
exists because of some kind of network error + # (DNS resolution, connection issue, server error, etc.) we want to + # ignore it and instead return a zero exit code. We expect to encounter + # some of these errors when we crawl, and we don't want them to cause + # downstream processing to fail. + # + # See list of wpull exit status codes here: + # https://github.com/ArchiveTeam/wpull/blob/v2.0.1/wpull/errors.py#L40-L63 + return ( + 0 + if exit_code + in ( + ExitStatus.network_failure, + ExitStatus.ssl_verification_error, + ExitStatus.authentication_failure, + ExitStatus.protocol_error, + ExitStatus.server_error, + ) + else exit_code + )