diff --git a/README.md b/README.md
index 4327367..7a660cb 100644
--- a/README.md
+++ b/README.md
@@ -166,7 +166,7 @@ yarn build
Create a Python virtual environment and install required packages:
```
-python3.8 -m venv venv
+python3.6 -m venv venv
source venv/bin/activate
pip install -r requirements/base.txt
```
@@ -248,7 +248,7 @@ under the `/sample/src` subdirectory.
To regenerate these files, first serve the sample website locally:
```
-python -m http.server -d ./sample/src
+cd ./sample/src && python -m http.server
```
This starts the sample website running at http://localhost:8000.
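+
+With the sample site running, you can point the crawler at it to generate a
+local SQLite database. A minimal sketch, assuming the standard Django
+`manage.py` entry point (the `crawl` command and its arguments come from this
+repository; the database filename is just an example):
+```
+python manage.py crawl http://localhost:8000/ sample.sqlite3 --recreate
+```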
diff --git a/crawler/management/commands/crawl.py b/crawler/management/commands/crawl.py
new file mode 100644
index 0000000..78705fb
--- /dev/null
+++ b/crawler/management/commands/crawl.py
@@ -0,0 +1,84 @@
+import os
+import os.path
+
+import djclick as click
+from wpull.application.builder import Builder
+from wpull.application.options import AppArgumentParser
+
+from crawler import wpull_plugin
+
+
+@click.command()
+@click.argument("start_url")
+@click.argument("db_filename", type=click.Path())
+@click.option(
+    "--max-pages", type=int, help="Maximum number of pages to crawl (0 = no limit)", default=0
+)
+@click.option("--depth", type=int, help="Maximum crawl depth", default=0)
+@click.option(
+ "--recreate",
+ is_flag=True,
+ show_default=True,
+ default=False,
+ help="Overwrite SQLite database if it already exists",
+)
+@click.option("--resume", is_flag=True)
+def command(start_url, db_filename, max_pages, depth, recreate, resume):
+ """Crawl a website to a SQLite database."""
+ if os.path.exists(db_filename):
+ if not recreate and not resume:
+ raise click.ClickException(
+ f"File {db_filename} already exists, "
+ "use --recreate to recreate "
+ "or --resume to resume a previous crawl."
+ )
+
+ if recreate:
+ os.remove(db_filename)
+
+ wpull_progress_filename = f"{db_filename}.wpull.db"
+ click.echo(
+ f"Storing crawl progress in {wpull_progress_filename}, use --resume to resume."
+ )
+
+ if not resume and os.path.exists(wpull_progress_filename):
+        os.remove(wpull_progress_filename)
+
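+    # Build wpull's command-line-style argument list in code and run the
+    # crawl in-process rather than shelling out to the wpull executable.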
+ arg_parser = AppArgumentParser()
+ args = arg_parser.parse_args(
+ [
+ start_url,
+ "--quiet",
+ "--recursive",
+ "--delete-after",
+ "--no-robots",
+ "--wait=0.5",
+ "--random-wait",
+ "--dns-timeout=5",
+ "--connect-timeout=5",
+ "--read-timeout=30",
+ "--session-timeout=30",
+ "--span-hosts",
+ "--link-extractors=html",
+ "--follow-tags=a",
+ "--user-agent=CFPB website indexer",
+ "--no-check-certificate",
+ f"--level={depth}",
+ f"--plugin-script={wpull_plugin.__file__}",
+ f"--plugin-args={db_filename},{max_pages}",
+ f"--database={wpull_progress_filename}",
+ ]
+ )
+ builder = Builder(args)
+ app = builder.build()
+
+    # This is required because wpull runs async code, and its hooks aren't
+    # called in a way that lets us wrap Django database calls with
+    # sync_to_async. Setting this is safe only because we download one URL at
+    # a time.
+ # https://docs.djangoproject.com/en/3.2/topics/async/#async-safety
+ os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
+
+ exit_status = app.run_sync()
+ click.echo(f"done, exiting with status {exit_status}")
+ return exit_status
diff --git a/crawler/models.py b/crawler/models.py
index bf74d1e..b30f958 100644
--- a/crawler/models.py
+++ b/crawler/models.py
@@ -1,4 +1,10 @@
+import lxml.etree
+import lxml.html.soupparser
+import re
+from urllib import parse
+
from django.db import models
+from django.utils import timezone
from modelcluster.models import ClusterableModel
from modelcluster.fields import ParentalManyToManyField
@@ -35,6 +41,105 @@ class Page(Request, ClusterableModel):
components = ParentalManyToManyField(Component, related_name="pages")
links = ParentalManyToManyField(Link, related_name="links")
+ def __str__(self):
+ return self.url
+
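+    # Patterns used when parsing crawled HTML: component class names carry an
+    # a-, m-, or o- prefix, /external-site/ paths wrap outbound links, and
+    # runs of whitespace are collapsed when extracting page text.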
+ HTML_COMPONENT_SEARCH = re.compile(r"(?:(?:class=\")|\s)((?:o|m|a)-[\w\-]*)")
+ HTML_EXTERNAL_SITE = re.compile("/external-site/")
+ HTML_WHITESPACE = re.compile(r"\s+")
+
+ @classmethod
+ def from_html(
+ cls,
+ url,
+ html,
+ internal_link_host,
+ ):
+ try:
+ tree = lxml.html.fromstring(html)
+ except lxml.etree.ParserError:
+ # https://bugs.launchpad.net/lxml/+bug/1949271
+ tree = lxml.html.soupparser.fromstring(html)
+
+ title_tag = tree.find(".//title")
+ title = title_tag.text.strip() if title_tag is not None else None
+ language = tree.find(".").get("lang")
+
+ if title is None:
+ return
+
+ body = cls._get_cleaned_body_from_tree(tree)
+
+ if body is not None:
+ text = cls.HTML_WHITESPACE.sub(" ", body.text_content()).strip()
+ else:
+ text = None
+
+ page = Page(
+ timestamp=timezone.now(),
+ url=url,
+ title=title,
+ language=language,
+ html=html,
+ text=text,
+ )
+
+ if body is None:
+ return page
+
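+        # Collect the unique hrefs of all anchor tags in the cleaned body.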
+ hrefs = list(
+ set(
+ href
+ for element, attribute, href, pos in body.iterlinks()
+ if "a" == element.tag and "href" == attribute
+ )
+ )
+
+ # Remove any external link URL wrapping.
+ for i, href in enumerate(hrefs):
+ parsed_href = parse.urlparse(href)
+ if not cls.HTML_EXTERNAL_SITE.match(parsed_href.path):
+ continue
+
+ if parsed_href.netloc and internal_link_host != parsed_href.netloc:
+ continue
+
+ ext_url = parse.parse_qs(parsed_href.query).get("ext_url")
+ if ext_url:
+ hrefs[i] = ext_url[0]
+
+ page.links = [Link(href=href) for href in sorted(hrefs)]
+
+ body_html = lxml.etree.tostring(body, encoding="unicode")
+
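+        # Record any component class names (a-, m-, or o- prefixes) found in
+        # the body HTML.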
+ class_names = set(cls.HTML_COMPONENT_SEARCH.findall(body_html))
+ page.components = [
+ Component(class_name=class_name) for class_name in sorted(class_names)
+ ]
+
+ return page
+
+ @staticmethod
+ def _get_cleaned_body_from_tree(tree):
+ """Extract page body without header, footer, images, or scripts."""
+ body = tree.find("./body")
+
+ if body is not None:
+ drop_element_selectors = [
+ ".o-header",
+ ".o-footer",
+ ".skip-nav",
+ "img",
+ "script",
+ "style",
+ ]
+
+ for drop_element_selector in drop_element_selectors:
+ for element in body.cssselect(drop_element_selector):
+ element.drop_tree()
+
+ return body
+
class ErrorBase(Request):
status_code = models.PositiveIntegerField(db_index=True)
@@ -43,10 +148,24 @@ class ErrorBase(Request):
class Meta(Request.Meta):
abstract = True
+ def __str__(self):
+ s = self.url
+
+ if self.referrer:
+ s += f" (from {self.referrer})"
+
+ s += f" {self.status_code}"
+
+ return s
+
class Error(ErrorBase):
- pass
+ def __str__(self):
+ return super().__str__() + " !"
class Redirect(ErrorBase):
location = models.TextField(db_index=True)
+
+ def __str__(self):
+ return super().__str__() + f" -> {self.location}"
diff --git a/crawler/tests/__init__.py b/crawler/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawler/tests/test_models.py b/crawler/tests/test_models.py
new file mode 100644
index 0000000..c6d1890
--- /dev/null
+++ b/crawler/tests/test_models.py
@@ -0,0 +1,110 @@
+from operator import attrgetter
+from unittest.mock import patch
+
+import lxml.etree
+
+from django.test import SimpleTestCase
+
+from crawler.models import Error, Page, Redirect
+
+
+class PageTests(SimpleTestCase):
+ def test_from_html_no_title_returns_none(self):
+ self.assertIsNone(
+ Page.from_html(
+ "https://example.com/",
+                "<html><body>This page has no title.</body></html>",
+ "example.com",
+ )
+ )
+
+ def check_from_html(self):
+        html = """
+<html lang="en">
+    <head>
+        <title>Test page</title>
+    </head>
+    <body>
+        <div class="m-links">
+            <h2>Links</h2>
+            <a href="/page/">A regular link on the same domain.</a>
+            <a class="a-external-link" href="/external-site/?ext_url=https%3A%2F%2Fexample.org%2F">An external link pointing to another domain</a>
+            <a class="a-external-link" href="/external-site/">An external link missing its target</a>
+            <a href="https://example.org/external-site/">A link on another domain that also uses /external-site/</a>
+        </div>
+    </body>
+</html>
+        """.strip()
+
+ page = Page.from_html("https://example.com/", html, "example.com")
+ self.assertEqual(str(page), "https://example.com/")
+ self.assertEqual(page.title, "Test page")
+ self.assertEqual(page.language, "en")
+ self.assertEqual(page.html, html)
+ self.assertEqual(
+ page.text,
+ (
+ "Links "
+ "A regular link on the same domain. "
+ "An external link pointing to another domain "
+ "An external link missing its target "
+ "A link on another domain that also uses /external-site/"
+ ),
+ )
+ self.assertCountEqual(
+ page.components.values_list("class_name", flat=True),
+ ["a-external-link", "m-links"],
+ )
+ self.assertCountEqual(
+ page.links.values_list("href", flat=True),
+ [
+ "/external-site/",
+ "/page/",
+ "https://example.org/",
+ "https://example.org/external-site/",
+ ],
+ )
+
+ def test_from_html(self):
+ self.check_from_html()
+
+ def test_from_html_etree_fallback_parser(self):
+ with patch(
+ "lxml.html.fromstring",
+ side_effect=lxml.etree.ParserError("testing parser error"),
+ ):
+ self.check_from_html()
+
+ def test_from_html_no_body(self):
+ html = 'Test page with no body'
+ page = Page.from_html("https://example.com/", html, "example.com")
+ self.assertEqual(str(page), "https://example.com/")
+ self.assertEqual(page.title, "Test page with no body")
+ self.assertEqual(page.language, "en")
+ self.assertEqual(page.html, html)
+ self.assertIsNone(page.text)
+
+
+class ErrorTests(SimpleTestCase):
+ def test_error_str(self):
+ self.assertEqual(
+ str(Error(url="/not-found/", status_code=404)), "/not-found/ 404 !"
+ )
+
+    def test_redirect_str_with_referrer(self):
+ self.assertEqual(
+ str(
+ Redirect(
+ url="/redirect/",
+ referrer="/source/",
+ status_code=301,
+ location="/destination/",
+ )
+ ),
+ "/redirect/ (from /source/) 301 -> /destination/",
+ )
diff --git a/crawler/wpull_plugin.py b/crawler/wpull_plugin.py
new file mode 100644
index 0000000..af85d08
--- /dev/null
+++ b/crawler/wpull_plugin.py
@@ -0,0 +1,273 @@
+import asyncio
+import logging
+import re
+from urllib import parse
+
+from django.core.management import call_command
+from django.db import connections
+from django.utils import timezone
+
+from wpull.application.hook import Actions
+from wpull.application.plugin import PluginFunctions, WpullPlugin, hook
+from wpull.network.connection import BaseConnection
+from wpull.pipeline.item import URLProperties
+from wpull.url import URLInfo
+
+from crawler.models import Error, Page, Redirect
+from crawler.writer import DatabaseWriter
+
+
+logger = logging.getLogger("crawler")
+
+
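+# Social sharing links that should never be requested during a crawl.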
+SKIP_URLS = list(
+ map(
+ re.compile,
+ [
+ r"^https://www.facebook.com/dialog/share\?.*",
+ r"^https://twitter.com/intent/tweet\?.*",
+ r"^https://www.linkedin.com/shareArticle\?.*",
+ ],
+ )
+)
+
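+# URLs that are checked with HEAD requests rather than fully downloaded.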
+HEAD_URLS = list(map(re.compile, [r"https://files.consumerfinance.gov/.*"]))
+
+
+def patch_wpull_connection():
+ """Use wait_timeout instead of close_timeout for readline."""
+
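+    # Replace BaseConnection.readline with a version that passes the
+    # connection's timeout as wait_timeout rather than close_timeout.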
+ @asyncio.coroutine
+ def readline(self):
+ data = yield from self.run_network_operation(
+ self.reader.readline(), wait_timeout=self._timeout, name="Readline"
+ )
+ return data
+
+ BaseConnection.readline = readline
+
+
+class DatabaseWritingPlugin(WpullPlugin):
+ def activate(self):
+ super().activate()
+
+ patch_wpull_connection()
+
+ self.start_url = URLInfo.parse(self.app_session.args.urls[0])
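+        # The crawl command passes --plugin-args as "<db_filename>,<max_pages>".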
+ self.db_filename, self.max_pages = self.app_session.args.plugin_args.rsplit(
+ ",", maxsplit=1
+ )
+ self.max_pages = int(self.max_pages)
+
+ self.db_writer = self.init_db()
+ self.accepted_urls = []
+ self.requested_urls = []
+
+ def deactivate(self):
+ super().deactivate()
+ self.db_writer.analyze()
+
+ def init_db(self):
+ db_alias = "warc_to_db"
+
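+        # Register an extra SQLite database at runtime and create the crawler
+        # tables in it, so results are written to db_filename rather than the
+        # default Django database.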
+ connections.databases[db_alias] = {
+ "ENGINE": "django.db.backends.sqlite3",
+ "NAME": self.db_filename,
+ }
+
+ call_command("migrate", database=db_alias, app_label="crawler", run_syncdb=True)
+
+ return DatabaseWriter(db_alias)
+
+ @property
+ def at_max_pages(self):
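+        # A max_pages value of 0 means there is no page limit.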
+ return self.max_pages and len(self.requested_urls) >= self.max_pages
+
+ @hook(PluginFunctions.accept_url)
+ def accept_url(self, item_session, verdict, reasons):
+ # If upstream logic rejected this URL, let the rejection stand.
+ if not verdict:
+ return False
+
+ # If we've already crawled enough pages, stop.
+ if self.at_max_pages:
+ return False
+
+ request = item_session.url_record
+
+ # Don't request pages more than once.
+ if request.url in self.requested_urls:
+ return False
+
+ # Always skip certain URLs.
+ if SKIP_URLS and any(skip_url.match(request.url) for skip_url in SKIP_URLS):
+ return False
+
+        # We follow links to other domains so we can check their validity,
+        # but we don't want to keep crawling once we're there. Therefore,
+        # skip any link whose parent page isn't on the start domain.
+ if (
+ request.parent_url_info.hostname_with_port
+ != self.start_url.hostname_with_port
+ ):
+ return False
+
+ # Use HEAD requests to speed up the crawl for certain external domains.
+ # We can't do this everywhere because other sites may respond to HEAD
+ # requests in inconvenient ways. This avoids the need to fully download
+ # external responses.
+ if HEAD_URLS and any(head_url.match(request.url) for head_url in HEAD_URLS):
+ item_session.request.method = "HEAD"
+
+ # If we're crawling on the start domain, apply additional rejections.
+ elif request.url_info.hostname_with_port == self.start_url.hostname_with_port:
+ # Don't crawl URLs that look like filenames.
+ if "." in request.url_info.path:
+ return False
+
+ qs = parse.parse_qs(request.url_info.query)
+
+ if qs:
+ # Don't crawl external link URLs directly.
+ # Instead crawl to their ultimate destination.
+ if Page.HTML_EXTERNAL_SITE.match(request.url_info.path):
+ ext_urls = qs.get("ext_url")
+ if ext_urls:
+ # Add the external URL to the list to be crawled.
+ ext_url = ext_urls[0]
+
+ url_properties = URLProperties()
+ url_properties.level = request.level
+ url_properties.inline_level = request.inline_level
+ url_properties.parent_url = request.parent_url
+ url_properties.root_url = request.root_url
+
+ item_session.app_session.factory["URLTable"].remove_many(
+ [ext_url]
+ )
+ item_session.add_url(ext_url, url_properites=url_properties)
+ return False
+
+            # For all other URLs, limit which querystrings get crawled:
+            # only follow URLs whose sole query parameter is "page".
+ elif list(qs.keys()) != ["page"]:
+ return False
+
+ if request.url not in self.accepted_urls:
+ logger.info(f"Crawling {request.url}")
+ self.accepted_urls.append(request.url)
+
+ return True
+
+ @hook(PluginFunctions.handle_error)
+ def handle_error(self, item_session, error):
+ if item_session.request.url in self.requested_urls:
+ logger.debug(f"Already logged error for {item_session.request.url}")
+ else:
+ logger.debug(error)
+ self.db_writer.write(
+ Error(
+ timestamp=timezone.now(),
+ url=item_session.request.url,
+ status_code=0,
+ referrer=item_session.request.fields.get("Referer"),
+ )
+ )
+
+ self.requested_urls.append(item_session.request.url)
+
+ @hook(PluginFunctions.handle_pre_response)
+ def handle_pre_response(self, item_session):
+ # Our accept_url handler converts certain external requests from GET to
+ # HEAD. The wpull response body handler seems to assume that HEAD
+ # request responses will never have Content-Length or Transfer-Encoding
+ # headers, which doesn't seem to be the case in practice:
+ #
+ # https://github.com/ArchiveTeam/wpull/blob/v2.0.1/wpull/protocol/http/stream.py#L441-L451
+ #
+        # Therefore, strip these headers if they exist. We don't need them:
+        # because this was an external request, we care only about the
+        # status code, not the response body.
+ if item_session.request.method == "HEAD":
+ item_session.response.fields.pop("Content-Length", None)
+ item_session.response.fields.pop("Transfer-Encoding", None)
+
+ return Actions.NORMAL
+
+ @hook(PluginFunctions.handle_response)
+ def handle_response(self, item_session):
+ request = item_session.request
+ response = item_session.response
+ status_code = response.status_code
+ timestamp = timezone.now()
+
+ if request.url in self.requested_urls:
+ logger.debug(f"Already logged {request.url}")
+ item_session.skip()
+ return Actions.FINISH
+ else:
+ self.requested_urls.append(request.url)
+
+ if status_code >= 300:
+ referrer = request.fields.get("Referer")
+
+ if status_code < 400:
+ location = response.fields.get("Location")
+ location_parsed = parse.urlparse(location)
+
+ self.db_writer.write(
+ Redirect(
+ timestamp=timestamp,
+ url=request.url,
+ status_code=status_code,
+ referrer=referrer,
+ location=location,
+ )
+ )
+
+ # Don't follow redirects that don't point to the start domain.
+ if (
+ location_parsed.hostname
+ and location_parsed.hostname != self.start_url.hostname
+ ) or (
+ location_parsed.port and location_parsed.port != self.start_url.port
+ ):
+ logger.debug(f"Not following redirect to {location}")
+ item_session.skip()
+ return Actions.FINISH
+ else:
+ self.db_writer.write(
+ Error(
+ timestamp=timestamp,
+ url=request.url,
+ status_code=status_code,
+ referrer=referrer,
+ )
+ )
+
+ return Actions.NORMAL
+
+ # If this request was to an external domain and it responded with
+ # a normal status code, we don't care about recording it.
+ if request.url_info.hostname_with_port != self.start_url.hostname_with_port:
+ item_session.skip()
+ return Actions.FINISH
+
+ page_record = self.process_200_response(request, response)
+
+ if not page_record:
+ logger.debug(f"Unexpected response for {request.url}, skipping")
+ item_session.skip()
+ return Actions.FINISH
+
+ self.db_writer.write(page_record)
+ return Actions.NORMAL
+
+ def process_200_response(self, request, response):
+ content_type = response.fields.get("Content-Type")
+
+ if not (content_type or "").startswith("text/html"):
+ return
+
+ html = response.body.content().decode("utf-8")
+ return Page.from_html(request.url, html, self.start_url.hostname)
diff --git a/crawler/writer.py b/crawler/writer.py
index 1d15f29..caaadb7 100644
--- a/crawler/writer.py
+++ b/crawler/writer.py
@@ -1,8 +1,13 @@
+import logging
+
from django.db import connections
from crawler.models import Component, Link, Page
+logger = logging.getLogger("crawler")
+
+
class DatabaseWriter:
def __init__(self, db):
self.db = db
@@ -12,6 +17,7 @@ def write(self, instance):
if isinstance(instance, Page):
return self._write_page(instance)
else:
+ logger.debug(f"Saving {instance}")
instance.save(using=self.db)
def _write_page(self, page):
@@ -36,6 +42,7 @@ def _write_page(self, page):
.values()
)
+ logger.debug(f"Saving {page}")
page.save(using=self.db)
def analyze(self):
diff --git a/fabfile.py b/fabfile.py
index e2b09d4..0d97059 100644
--- a/fabfile.py
+++ b/fabfile.py
@@ -22,7 +22,7 @@
SQLITE_BASENAME = f"sqlite-autoconf-{SQLITE_VERSION}"
SQLITE_INSTALL_ROOT = f"{DEPLOY_ROOT}/{SQLITE_BASENAME}"
-PYTHON_VERSION = "3.8.13"
+PYTHON_VERSION = "3.6.15"
PYTHON_BASENAME = f"Python-{PYTHON_VERSION}"
PYTHON_INSTALL_ROOT = f"{DEPLOY_ROOT}/{PYTHON_BASENAME}"
diff --git a/requirements/base.txt b/requirements/base.txt
index 6573881..91679fe 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -1,12 +1,21 @@
-click==8.1.3
+beautifulsoup4==4.12.2
+click==8.0.4
cssselect==1.1.0
-Django==4.0.7
+Django==3.2.22
django-click==2.3.0
-django-debug-toolbar==3.4.0
-django-filter==22.1
-django-modelcluster==6.0
+django-debug-toolbar==3.2.4
+django-filter==21.1
+django-modelcluster==5.3
djangorestframework==3.13.1
djangorestframework-csv==2.1.1
lxml==4.9.1
warcio==1.7.4
-whitenoise==6.1.0
+whitenoise==5.3.0
+wpull==2.0.1
+
+# wpull doesn't set upper bounds for some of its requirements,
+# so we need to specify these manually:
+# See https://github.com/ArchiveTeam/wpull/blob/v2.0.1/requirements.txt
+html5lib==0.9999999
+sqlalchemy==1.0.12
+tornado==4.5.3
diff --git a/sample/src/index.html b/sample/src/index.html
index 4cd4e03..9e3cf13 100644
--- a/sample/src/index.html
+++ b/sample/src/index.html
@@ -10,5 +10,13 @@
Sample homepage
This is sample content.
This is a link to a child page.
+  <p><a href="https://example.org/">This is a link somewhere else.</a></p>
+  <p><a href="/external-site/?ext_url=https%3A%2F%2Fexample.org%2F">This is an obfuscated link somewhere else.</a></p>
+  <p><a href="/external-site/?ext_url=https%3A%2F%2Fexample.org%2Fpage%2F">This is another obfuscated link some
+  where else.</a></p>
+  <p><a href="/file.pdf">This links to a file.</a></p>
+  <p><a href="https://example.org/file.pdf">This links to a file somewhere else.</a></p>
+  <p><a href="?page=2">This link has a page query string parameter.</a></p>
   <p><a href="?foo=bar">This link has a non-page query string parameter.</a></p>
+  <p><a href="?page=2&amp;foo=bar">This link has multiple query string parameters.</a></p>