diff --git a/README.md b/README.md
index 4327367..7a660cb 100644
--- a/README.md
+++ b/README.md
@@ -166,7 +166,7 @@ yarn build
 Create a Python virtual environment and install required packages:
 
 ```
-python3.8 -m venv venv
+python3.6 -m venv venv
 source venv/bin/activate
 pip install -r requirements/base.txt
 ```
@@ -248,7 +248,7 @@ under the `/sample/src` subdirectory.
 To regenerate these files, first serve the sample website locally:
 
 ```
-python -m http.server -d ./sample/src
+cd ./sample/src && python -m http.server
 ```
 
 This starts the sample website running at http://localhost:8000.
diff --git a/crawler/management/commands/crawl.py b/crawler/management/commands/crawl.py
new file mode 100644
index 0000000..78705fb
--- /dev/null
+++ b/crawler/management/commands/crawl.py
@@ -0,0 +1,84 @@
+import os
+import os.path
+
+import djclick as click
+from wpull.application.builder import Builder
+from wpull.application.options import AppArgumentParser
+
+from crawler import wpull_plugin
+
+
+@click.command()
+@click.argument("start_url")
+@click.argument("db_filename", type=click.Path())
+@click.option(
+    "--max-pages", type=int, help="Maximum number of pages to crawl", default=0
+)
+@click.option("--depth", type=int, help="Maximum crawl depth", default=0)
+@click.option(
+    "--recreate",
+    is_flag=True,
+    show_default=True,
+    default=False,
+    help="Overwrite SQLite database if it already exists",
+)
+@click.option("--resume", is_flag=True)
+def command(start_url, db_filename, max_pages, depth, recreate, resume):
+    """Crawl a website to a SQLite database."""
+    if os.path.exists(db_filename):
+        if not recreate and not resume:
+            raise click.ClickException(
+                f"File {db_filename} already exists, "
+                "use --recreate to recreate "
+                "or --resume to resume a previous crawl."
+            )
+
+        if recreate:
+            os.remove(db_filename)
+
+    wpull_progress_filename = f"{db_filename}.wpull.db"
+    click.echo(
+        f"Storing crawl progress in {wpull_progress_filename}, use --resume to resume."
+    )
+
+    if not resume and os.path.exists(wpull_progress_filename):
+        os.remove(wpull_progress_filename)
+
+    arg_parser = AppArgumentParser()
+    args = arg_parser.parse_args(
+        [
+            start_url,
+            "--quiet",
+            "--recursive",
+            "--delete-after",
+            "--no-robots",
+            "--wait=0.5",
+            "--random-wait",
+            "--dns-timeout=5",
+            "--connect-timeout=5",
+            "--read-timeout=30",
+            "--session-timeout=30",
+            "--span-hosts",
+            "--link-extractors=html",
+            "--follow-tags=a",
+            "--user-agent=CFPB website indexer",
+            "--no-check-certificate",
+            f"--level={depth}",
+            f"--plugin-script={wpull_plugin.__file__}",
+            f"--plugin-args={db_filename},{max_pages}",
+            f"--database={wpull_progress_filename}",
+        ]
+    )
+    builder = Builder(args)
+    app = builder.build()
+
+    # This is required due to the use of async code in wpull. Unfortunately
+    # wpull hooks aren't called in a way that allows us to wrap Django database
+    # calls with sync_to_async. This is only safe because we only download one
+    # URL at a time.
+    # https://docs.djangoproject.com/en/3.2/topics/async/#async-safety
+    os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
+
+    exit_status = app.run_sync()
+    click.echo(f"done, exiting with status {exit_status}")
+    return exit_status
diff --git a/crawler/models.py b/crawler/models.py
index bf74d1e..b30f958 100644
--- a/crawler/models.py
+++ b/crawler/models.py
@@ -1,4 +1,10 @@
+import lxml.etree
+import lxml.html.soupparser
+import re
+from urllib import parse
+
 from django.db import models
+from django.utils import timezone
 
 from modelcluster.models import ClusterableModel
 from modelcluster.fields import ParentalManyToManyField
@@ -35,6 +41,105 @@ class Page(Request, ClusterableModel):
     components = ParentalManyToManyField(Component, related_name="pages")
     links = ParentalManyToManyField(Link, related_name="links")
 
+    def __str__(self):
+        return self.url
+
+    HTML_COMPONENT_SEARCH = re.compile(r"(?:(?:class=\")|\s)((?:o|m|a)-[\w\-]*)")
+    HTML_EXTERNAL_SITE = re.compile("/external-site/")
+    HTML_WHITESPACE = re.compile(r"\s+")
+
+    @classmethod
+    def from_html(
+        cls,
+        url,
+        html,
+        internal_link_host,
+    ):
+        try:
+            tree = lxml.html.fromstring(html)
+        except lxml.etree.ParserError:
+            # https://bugs.launchpad.net/lxml/+bug/1949271
+            tree = lxml.html.soupparser.fromstring(html)
+
+        title_tag = tree.find(".//title")
+        title = title_tag.text.strip() if title_tag is not None else None
+        language = tree.find(".").get("lang")
+
+        if title is None:
+            return
+
+        body = cls._get_cleaned_body_from_tree(tree)
+
+        if body is not None:
+            text = cls.HTML_WHITESPACE.sub(" ", body.text_content()).strip()
+        else:
+            text = None
+
+        page = Page(
+            timestamp=timezone.now(),
+            url=url,
+            title=title,
+            language=language,
+            html=html,
+            text=text,
+        )
+
+        if body is None:
+            return page
+
+        hrefs = list(
+            set(
+                href
+                for element, attribute, href, pos in body.iterlinks()
+                if "a" == element.tag and "href" == attribute
+            )
+        )
+
+        # Remove any external link URL wrapping.
+        for i, href in enumerate(hrefs):
+            parsed_href = parse.urlparse(href)
+            if not cls.HTML_EXTERNAL_SITE.match(parsed_href.path):
+                continue
+
+            if parsed_href.netloc and internal_link_host != parsed_href.netloc:
+                continue
+
+            ext_url = parse.parse_qs(parsed_href.query).get("ext_url")
+            if ext_url:
+                hrefs[i] = ext_url[0]
+
+        page.links = [Link(href=href) for href in sorted(hrefs)]
+
+        body_html = lxml.etree.tostring(body, encoding="unicode")
+
+        class_names = set(cls.HTML_COMPONENT_SEARCH.findall(body_html))
+        page.components = [
+            Component(class_name=class_name) for class_name in sorted(class_names)
+        ]
+
+        return page
+
+    @staticmethod
+    def _get_cleaned_body_from_tree(tree):
+        """Extract page body without header, footer, images, or scripts."""
+        body = tree.find("./body")
+
+        if body is not None:
+            drop_element_selectors = [
+                ".o-header",
+                ".o-footer",
+                ".skip-nav",
+                "img",
+                "script",
+                "style",
+            ]
+
+            for drop_element_selector in drop_element_selectors:
+                for element in body.cssselect(drop_element_selector):
+                    element.drop_tree()
+
+        return body
+
 
 class ErrorBase(Request):
     status_code = models.PositiveIntegerField(db_index=True)
@@ -43,10 +148,24 @@
     class Meta(Request.Meta):
         abstract = True
 
+    def __str__(self):
+        s = self.url
+
+        if self.referrer:
+            s += f" (from {self.referrer})"
+
+        s += f" {self.status_code}"
+
+        return s
+
 
 class Error(ErrorBase):
-    pass
+    def __str__(self):
+        return super().__str__() + " !"
 
 
 class Redirect(ErrorBase):
     location = models.TextField(db_index=True)
+
+    def __str__(self):
+        return super().__str__() + f" -> {self.location}"
diff --git a/crawler/tests/__init__.py b/crawler/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawler/tests/test_models.py b/crawler/tests/test_models.py
new file mode 100644
index 0000000..c6d1890
--- /dev/null
+++ b/crawler/tests/test_models.py
@@ -0,0 +1,110 @@
+from operator import attrgetter
+from unittest.mock import patch
+
+import lxml.etree
+
+from django.test import SimpleTestCase
+
+from crawler.models import Error, Page, Redirect
+
+
+class PageTests(SimpleTestCase):
+    def test_from_html_no_title_returns_none(self):
+        self.assertIsNone(
+            Page.from_html(
+                "https://example.com/",
+                "This page has no title.",
+                "example.com",
+            )
+        )
+
+    def check_from_html(self):
+        html = """
+<html lang="en">
+<head><title>Test page</title></head>
+<body>
+<div class="m-links">
+<h2>Links</h2>
+<p><a href="/page/">A regular link on the same domain.</a></p>
+<p>
+<a class="a-external-link" href="/external-site/?ext_url=https%3A%2F%2Fexample.org%2F">
+An external link pointing to another domain
+</a>
+</p>
+<p>
+<a class="a-external-link" href="/external-site/">
+An external link missing its target
+</a>
+</p>
+<p><a href="https://example.org/external-site/">A link on another domain that also uses /external-site/</a></p>
+</div>
+</body>
+</html>
+        """.strip()
+
+        page = Page.from_html("https://example.com/", html, "example.com")
+        self.assertEqual(str(page), "https://example.com/")
+        self.assertEqual(page.title, "Test page")
+        self.assertEqual(page.language, "en")
+        self.assertEqual(page.html, html)
+        self.assertEqual(
+            page.text,
+            (
+                "Links "
+                "A regular link on the same domain. "
+                "An external link pointing to another domain "
+                "An external link missing its target "
+                "A link on another domain that also uses /external-site/"
+            ),
+        )
+        self.assertCountEqual(
+            page.components.values_list("class_name", flat=True),
+            ["a-external-link", "m-links"],
+        )
+        self.assertCountEqual(
+            page.links.values_list("href", flat=True),
+            [
+                "/external-site/",
+                "/page/",
+                "https://example.org/",
+                "https://example.org/external-site/",
+            ],
+        )
+
+    def test_from_html(self):
+        self.check_from_html()
+
+    def test_from_html_etree_fallback_parser(self):
+        with patch(
+            "lxml.html.fromstring",
+            side_effect=lxml.etree.ParserError("testing parser error"),
+        ):
+            self.check_from_html()
+
+    def test_from_html_no_body(self):
+        html = '<html lang="en"><head><title>Test page with no body</title></head></html>'
+        page = Page.from_html("https://example.com/", html, "example.com")
+        self.assertEqual(str(page), "https://example.com/")
+        self.assertEqual(page.title, "Test page with no body")
+        self.assertEqual(page.language, "en")
+        self.assertEqual(page.html, html)
+        self.assertIsNone(page.text)
+
+
+class ErrorTests(SimpleTestCase):
+    def test_error_str(self):
+        self.assertEqual(
+            str(Error(url="/not-found/", status_code=404)), "/not-found/ 404 !"
+        )
+
+    def test_error_str_with_referrer(self):
+        self.assertEqual(
+            str(
+                Redirect(
+                    url="/redirect/",
+                    referrer="/source/",
+                    status_code=301,
+                    location="/destination/",
+                )
+            ),
+            "/redirect/ (from /source/) 301 -> /destination/",
+        )
diff --git a/crawler/wpull_plugin.py b/crawler/wpull_plugin.py
new file mode 100644
index 0000000..af85d08
--- /dev/null
+++ b/crawler/wpull_plugin.py
@@ -0,0 +1,273 @@
+import asyncio
+import logging
+import re
+from urllib import parse
+
+from django.core.management import call_command
+from django.db import connections
+from django.utils import timezone
+
+from wpull.application.hook import Actions
+from wpull.application.plugin import PluginFunctions, WpullPlugin, hook
+from wpull.network.connection import BaseConnection
+from wpull.pipeline.item import URLProperties
+from wpull.url import URLInfo
+
+from crawler.models import Error, Page, Redirect
+from crawler.writer import DatabaseWriter
+
+
+logger = logging.getLogger("crawler")
+
+
+SKIP_URLS = list(
+    map(
+        re.compile,
+        [
+            r"^https://www.facebook.com/dialog/share\?.*",
+            r"^https://twitter.com/intent/tweet\?.*",
+            r"^https://www.linkedin.com/shareArticle\?.*",
+        ],
+    )
+)
+
+HEAD_URLS = list(map(re.compile, [r"https://files.consumerfinance.gov/.*"]))
+
+
+def patch_wpull_connection():
+    """Use wait_timeout instead of close_timeout for readline."""
+
+    @asyncio.coroutine
+    def readline(self):
+        data = yield from self.run_network_operation(
+            self.reader.readline(), wait_timeout=self._timeout, name="Readline"
+        )
+        return data
+
+    BaseConnection.readline = readline
+
+
+class DatabaseWritingPlugin(WpullPlugin):
+    def activate(self):
+        super().activate()
+
+        patch_wpull_connection()
+
+        self.start_url = URLInfo.parse(self.app_session.args.urls[0])
+        self.db_filename, self.max_pages = self.app_session.args.plugin_args.rsplit(
+            ",", maxsplit=1
+        )
+        self.max_pages = int(self.max_pages)
+
+        self.db_writer = self.init_db()
+        self.accepted_urls = []
+        self.requested_urls = []
+
+    def deactivate(self):
+        super().deactivate()
+        self.db_writer.analyze()
+
+    def init_db(self):
+        db_alias = "warc_to_db"
+
+        connections.databases[db_alias] = {
+            "ENGINE": "django.db.backends.sqlite3",
+            "NAME": self.db_filename,
+        }
+
+        call_command("migrate", database=db_alias, app_label="crawler", run_syncdb=True)
+
+        return DatabaseWriter(db_alias)
+
+    @property
+    def at_max_pages(self):
+        return self.max_pages and len(self.requested_urls) >= self.max_pages
+
+    @hook(PluginFunctions.accept_url)
+    def accept_url(self, item_session, verdict, reasons):
+        # If upstream logic rejected this URL, let the rejection stand.
+        if not verdict:
+            return False
+
+        # If we've already crawled enough pages, stop.
+        if self.at_max_pages:
+            return False
+
+        request = item_session.url_record
+
+        # Don't request pages more than once.
+        if request.url in self.requested_urls:
+            return False
+
+        # Always skip certain URLs.
+        if SKIP_URLS and any(skip_url.match(request.url) for skip_url in SKIP_URLS):
+            return False
+
+        # We want to crawl links to different domains to test their validity.
+        # But once we've done that, we don't want to keep crawling there.
+        # Therefore, don't crawl links that start on different domains.
+        if (
+            request.parent_url_info.hostname_with_port
+            != self.start_url.hostname_with_port
+        ):
+            return False
+
+        # Use HEAD requests to speed up the crawl for certain external domains.
+        # We can't do this everywhere because other sites may respond to HEAD
+        # requests in inconvenient ways. This avoids the need to fully download
+        # external responses.
+        if HEAD_URLS and any(head_url.match(request.url) for head_url in HEAD_URLS):
+            item_session.request.method = "HEAD"
+
+        # If we're crawling on the start domain, apply additional rejections.
+        elif request.url_info.hostname_with_port == self.start_url.hostname_with_port:
+            # Don't crawl URLs that look like filenames.
+            if "." in request.url_info.path:
+                return False
+
+            qs = parse.parse_qs(request.url_info.query)
+
+            if qs:
+                # Don't crawl external link URLs directly.
+                # Instead crawl to their ultimate destination.
+                if Page.HTML_EXTERNAL_SITE.match(request.url_info.path):
+                    ext_urls = qs.get("ext_url")
+                    if ext_urls:
+                        # Add the external URL to the list to be crawled.
+                        ext_url = ext_urls[0]
+
+                        url_properties = URLProperties()
+                        url_properties.level = request.level
+                        url_properties.inline_level = request.inline_level
+                        url_properties.parent_url = request.parent_url
+                        url_properties.root_url = request.root_url
+
+                        item_session.app_session.factory["URLTable"].remove_many(
+                            [ext_url]
+                        )
+                        item_session.add_url(ext_url, url_properites=url_properties)
+                        return False
+
+                # For all other URLs, limit querystrings that get crawled.
+                # Only crawl pages that only have the "page" parameter.
+                elif list(qs.keys()) != ["page"]:
+                    return False
+
+        if request.url not in self.accepted_urls:
+            logger.info(f"Crawling {request.url}")
+            self.accepted_urls.append(request.url)
+
+        return True
+
+    @hook(PluginFunctions.handle_error)
+    def handle_error(self, item_session, error):
+        if item_session.request.url in self.requested_urls:
+            logger.debug(f"Already logged error for {item_session.request.url}")
+        else:
+            logger.debug(error)
+            self.db_writer.write(
+                Error(
+                    timestamp=timezone.now(),
+                    url=item_session.request.url,
+                    status_code=0,
+                    referrer=item_session.request.fields.get("Referer"),
+                )
+            )
+
+        self.requested_urls.append(item_session.request.url)
+
+    @hook(PluginFunctions.handle_pre_response)
+    def handle_pre_response(self, item_session):
+        # Our accept_url handler converts certain external requests from GET to
+        # HEAD. The wpull response body handler seems to assume that HEAD
+        # request responses will never have Content-Length or Transfer-Encoding
+        # headers, which doesn't seem to be the case in practice:
+        #
+        # https://github.com/ArchiveTeam/wpull/blob/v2.0.1/wpull/protocol/http/stream.py#L441-L451
+        #
+        # Therefore, we strip these headers out if they exist, since we don't
+        # need them for our purposes. Since this was an external request, we
+        # care only about the status code, not the response body.
+        if item_session.request.method == "HEAD":
+            item_session.response.fields.pop("Content-Length", None)
+            item_session.response.fields.pop("Transfer-Encoding", None)
+
+        return Actions.NORMAL
+
+    @hook(PluginFunctions.handle_response)
+    def handle_response(self, item_session):
+        request = item_session.request
+        response = item_session.response
+        status_code = response.status_code
+        timestamp = timezone.now()
+
+        if request.url in self.requested_urls:
+            logger.debug(f"Already logged {request.url}")
+            item_session.skip()
+            return Actions.FINISH
+        else:
+            self.requested_urls.append(request.url)
+
+        if status_code >= 300:
+            referrer = request.fields.get("Referer")
+
+            if status_code < 400:
+                location = response.fields.get("Location")
+                location_parsed = parse.urlparse(location)
+
+                self.db_writer.write(
+                    Redirect(
+                        timestamp=timestamp,
+                        url=request.url,
+                        status_code=status_code,
+                        referrer=referrer,
+                        location=location,
+                    )
+                )
+
+                # Don't follow redirects that don't point to the start domain.
+                if (
+                    location_parsed.hostname
+                    and location_parsed.hostname != self.start_url.hostname
+                ) or (
+                    location_parsed.port and location_parsed.port != self.start_url.port
+                ):
+                    logger.debug(f"Not following redirect to {location}")
+                    item_session.skip()
+                    return Actions.FINISH
+            else:
+                self.db_writer.write(
+                    Error(
+                        timestamp=timestamp,
+                        url=request.url,
+                        status_code=status_code,
+                        referrer=referrer,
+                    )
+                )
+
+            return Actions.NORMAL
+
+        # If this request was to an external domain and it responded with
+        # a normal status code, we don't care about recording it.
+        if request.url_info.hostname_with_port != self.start_url.hostname_with_port:
+            item_session.skip()
+            return Actions.FINISH
+
+        page_record = self.process_200_response(request, response)
+
+        if not page_record:
+            logger.debug(f"Unexpected response for {request.url}, skipping")
+            item_session.skip()
+            return Actions.FINISH
+
+        self.db_writer.write(page_record)
+        return Actions.NORMAL
+
+    def process_200_response(self, request, response):
+        content_type = response.fields.get("Content-Type")
+
+        if not (content_type or "").startswith("text/html"):
+            return
+
+        html = response.body.content().decode("utf-8")
+        return Page.from_html(request.url, html, self.start_url.hostname)
diff --git a/crawler/writer.py b/crawler/writer.py
index 1d15f29..caaadb7 100644
--- a/crawler/writer.py
+++ b/crawler/writer.py
@@ -1,8 +1,13 @@
+import logging
+
 from django.db import connections
 
 from crawler.models import Component, Link, Page
 
 
+logger = logging.getLogger("crawler")
+
+
 class DatabaseWriter:
     def __init__(self, db):
         self.db = db
@@ -12,6 +17,7 @@ def write(self, instance):
         if isinstance(instance, Page):
             return self._write_page(instance)
         else:
+            logger.debug(f"Saving {instance}")
             instance.save(using=self.db)
 
     def _write_page(self, page):
@@ -36,6 +42,7 @@ def _write_page(self, page):
             .values()
         )
 
+        logger.debug(f"Saving {page}")
         page.save(using=self.db)
 
     def analyze(self):
diff --git a/fabfile.py b/fabfile.py
index e2b09d4..0d97059 100644
--- a/fabfile.py
+++ b/fabfile.py
@@ -22,7 +22,7 @@
 SQLITE_BASENAME = f"sqlite-autoconf-{SQLITE_VERSION}"
 SQLITE_INSTALL_ROOT = f"{DEPLOY_ROOT}/{SQLITE_BASENAME}"
 
-PYTHON_VERSION = "3.8.13"
+PYTHON_VERSION = "3.6.15"
 PYTHON_BASENAME = f"Python-{PYTHON_VERSION}"
 PYTHON_INSTALL_ROOT = f"{DEPLOY_ROOT}/{PYTHON_BASENAME}"
 
diff --git a/requirements/base.txt b/requirements/base.txt
index 6573881..91679fe 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -1,12 +1,21 @@
-click==8.1.3
+beautifulsoup4==4.12.2
+click==8.0.4
 cssselect==1.1.0
-Django==4.0.7
+Django==3.2.22
 django-click==2.3.0
-django-debug-toolbar==3.4.0
-django-filter==22.1
-django-modelcluster==6.0
+django-debug-toolbar==3.2.4
+django-filter==21.1
+django-modelcluster==5.3
 djangorestframework==3.13.1
 djangorestframework-csv==2.1.1
 lxml==4.9.1
 warcio==1.7.4
-whitenoise==6.1.0
+whitenoise==5.3.0
+wpull==2.0.1
+
+# wpull doesn't set upper bounds for some of its requirements,
+# so we need to specify these manually:
+# See https://github.com/ArchiveTeam/wpull/blob/v2.0.1/requirements.txt
+html5lib==0.9999999
+sqlalchemy==1.0.12
+tornado==4.5.3
diff --git a/sample/src/index.html b/sample/src/index.html
index 4cd4e03..9e3cf13 100644
--- a/sample/src/index.html
+++ b/sample/src/index.html
@@ -10,5 +10,13 @@ <h1>Sample homepage</h1>
   <p>This is sample content.</p>
   <p><a href="/child/">This is a link to a child page.</a></p>
+  <p><a href="https://example.com/">This is a link somewhere else.</a></p>
+  <p><a href="/external-site/?ext_url=https%3A%2F%2Fexample.org%2F" data-pretty-href="https://example.org/">This is an obfuscated link somewhere else.</a></p>
+  <p><a href="/external-site/?ext_url=https%3A%2F%2Fexample.org%2F" data-pretty-href="https://example.org/">This is another obfuscated link some
+  where else.</a></p>
+  <p><a href="./file.xlsx">This links to a file.</a></p>
+  <p><a href="https://example.com/file.xlsx">This links to a file somewhere else.</a></p>
+  <p><a href="/child/?page=2">This link has a page query string parameter.</a></p>
   <p><a href="/child/?foo=bar">This link has a non-page query string parameter.</a></p>
+  <p><a href="/child/?page=2&foo=bar">This link has multiple query string parameters.</a></p>
 </body>
 </html>
diff --git a/settings.py b/settings.py
index 1d71f47..b706559 100644
--- a/settings.py
+++ b/settings.py
@@ -1,13 +1,11 @@
 """
 Django settings for viewer project.
 
-Generated by 'django-admin startproject' using Django 4.0.3.
-
 For more information on this file, see
-https://docs.djangoproject.com/en/4.0/topics/settings/
+https://docs.djangoproject.com/en/3.2/topics/settings/
 
 For the full list of settings and their values, see
-https://docs.djangoproject.com/en/4.0/ref/settings/
+https://docs.djangoproject.com/en/3.2/ref/settings/
 """
 import os
 import sys
@@ -18,7 +16,7 @@
 
 
 # Quick-start development settings - unsuitable for production
-# See https://docs.djangoproject.com/en/4.0/howto/deployment/checklist/
+# See https://docs.djangoproject.com/en/3.2/howto/deployment/checklist/
 
 # SECURITY WARNING: keep the secret key used in production secret!
 SECRET_KEY = "django-insecure-a94cjadrz=y0o+c75138ro=gn3oq0*by)gs1cs88k$9+taepp("
@@ -73,7 +71,7 @@
 
 
 # Database
-# https://docs.djangoproject.com/en/4.0/ref/settings/#databases
+# https://docs.djangoproject.com/en/3.2/ref/settings/#databases
 
 _sample_db_path = str(BASE_DIR / "sample" / "sample.sqlite3")
 _env_db_path = os.getenv("CRAWL_DATABASE")
@@ -97,7 +95,7 @@
 }
 
 # Internationalization
-# https://docs.djangoproject.com/en/4.0/topics/i18n/
+# https://docs.djangoproject.com/en/3.2/topics/i18n/
 
 LANGUAGE_CODE = "en-us"
 
@@ -109,12 +107,12 @@
 
 
 # Static files (CSS, JavaScript, Images)
-# https://docs.djangoproject.com/en/4.0/howto/static-files/
+# https://docs.djangoproject.com/en/3.2/howto/static-files/
 
 STATIC_URL = "static/"
 
 # Default primary key field type
-# https://docs.djangoproject.com/en/4.0/ref/settings/#default-auto-field
+# https://docs.djangoproject.com/en/3.2/ref/settings/#default-auto-field
 
 DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
 
@@ -138,3 +136,27 @@
     "PAGE_SIZE": 25,
     "UNAUTHENTICATED_USER": None,
 }
+
+LOGGING = {
+    "version": 1,
+    "disable_existing_loggers": False,
+    "formatters": {
+        "default": {
+            "format": " %(asctime)s.%(msecs)03d %(levelname)s %(message)s",
+            "datefmt": "%Y-%m-%d %H:%M:%S",
+        }
+    },
+    "handlers": {
+        "console": {
+            "class": "logging.StreamHandler",
+            "formatter": "default",
+        },
+    },
+    "loggers": {
+        "crawler": {
+            "handlers": ["console"],
+            "level": "DEBUG",
+            "propagate": False,
+        },
+    },
+}
diff --git a/wsgi.py b/wsgi.py
index 49cf873..d9bf8fc 100644
--- a/wsgi.py
+++ b/wsgi.py
@@ -4,7 +4,7 @@
 It exposes the WSGI callable as a module-level variable named ``application``.
 
 For more information on this file, see
-https://docs.djangoproject.com/en/4.0/howto/deployment/wsgi/
+https://docs.djangoproject.com/en/3.2/howto/deployment/wsgi/
 """
 
 import os
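
---

Not part of the patch, but a usage sketch for trying the new `crawl` command locally. It assumes the project exposes a standard Django `manage.py` (the repository ships `settings.py` and `wsgi.py`, so the exact entry point name is an assumption here); the URL, database filename, and limits below are illustrative only:

```
# Serve the sample site in one terminal, as described in the README
cd ./sample/src && python -m http.server

# In another terminal, crawl it into a SQLite database
python manage.py crawl http://localhost:8000/ sample-crawl.sqlite3 --max-pages 10 --depth 2

# Re-running against an existing database requires --recreate (start over)
# or --resume (continue a previous crawl)
python manage.py crawl http://localhost:8000/ sample-crawl.sqlite3 --resume

# settings.py reads CRAWL_DATABASE, so the viewer can be pointed at the result
CRAWL_DATABASE=sample-crawl.sqlite3 python manage.py runserver
```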
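
Equally hedged, a quick way to sanity-check the crawler's output once a crawl finishes. The table names below assume Django's default `<app_label>_<model>` naming for the `Page`, `Error`, and `Redirect` models defined in `crawler/models.py`; adjust them if the project overrides `db_table`:

```
# Count crawled pages
sqlite3 sample-crawl.sqlite3 "SELECT COUNT(*) FROM crawler_page;"

# Summarize recorded errors by status code
sqlite3 sample-crawl.sqlite3 "SELECT status_code, COUNT(*) FROM crawler_error GROUP BY status_code;"

# Spot-check recorded redirects
sqlite3 sample-crawl.sqlite3 "SELECT url, location FROM crawler_redirect LIMIT 5;"
```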