diff --git a/Dockerfile b/Dockerfile
index 15d5a73..8edef7d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,13 +6,12 @@ ENV LANG en_US.UTF-8
 # Disable pip cache dir.
 ENV PIP_NO_CACHE_DIR 1
 
+# Allow pip install as root.
+ENV PIP_ROOT_USER_ACTION ignore
+
 # Stops Python default buffering to stdout, improving logging to the console.
 ENV PYTHONUNBUFFERED 1
 
-# Define app home and workdir.
-ENV APP_HOME /usr/src/app
-WORKDIR $APP_HOME
-
 # Create a non-root user for the container.
 ARG USERNAME=app
 ARG USER_UID=1000
@@ -24,9 +23,12 @@ RUN addgroup \
     --uid $USER_UID \
     --ingroup $USERNAME \
     --disabled-password \
-    --no-create-home \
     $USERNAME
 
+# Define app home and workdir.
+ENV APP_HOME /home/$USERNAME
+WORKDIR $APP_HOME
+
 # Copy the whole project except for what is in .dockerignore.
 COPY --chown=$USERNAME:$USERNAME . .
 
@@ -53,6 +55,11 @@ RUN set -eux; \
     ; \
     pip install -U pip; \
     pip install --no-cache-dir -r requirements/base.txt; \
+    # Remove keys that aren't needed by the application but would be
+    # flagged as a vulnerability by our Docker image scanner.
+    rm /usr/local/lib/python3.12/site-packages/tornado/test/test.key; \
+    rm /usr/local/lib/python3.12/site-packages/wpull/proxy/proxy.key; \
+    rm /usr/local/lib/python3.12/site-packages/wpull/testing/test.pem; \
     apk del .backend-deps
 
 # Build the frontend.
@@ -71,7 +78,6 @@ RUN set -eux; \
     rm -rf ./node_modules; \
     apk del .frontend-deps
 
-
 # Run the application with the user we created.
 USER $USERNAME
 
diff --git a/README.md b/README.md
index 8dc1f99..0049b58 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,8 @@ and export results as CSV or JSON reports.
 
 ## Crawling a website
 
+### Using a Python virtual environment
+
 Create a Python virtual environment and install required packages:
 
 ```
@@ -37,6 +39,23 @@ Crawl a website:
 ./manage.py crawl https://www.consumerfinance.gov crawl.sqlite3
 ```
 
+### Using Docker
+
+To build the Docker image:
+
+```
+docker build -t website-indexer:main .
+```
+
+Crawl a website:
+
+```
+docker run -it \
+    -p 8000:8000 \
+    -v `pwd`:/data website-indexer:main \
+    python manage.py crawl https://www.consumerfinance.gov /data/crawl.sqlite3
+```
+
 ## Searching the crawl database
 
 You can use the
@@ -129,6 +148,8 @@ sqlite> SELECT url FROM crawler_page WHERE html LIKE "%…%" ORDER BY URL asc;
 
 ## Running the viewer application
 
+### Using a Python virtual environment
+
 From the repo's root, compile frontend assets:
 
 ```
@@ -165,6 +186,31 @@ Finally, run the Django webserver:
 
 The viewer application will be available locally at http://localhost:8000.
 
+### Using Docker
+
+To build the Docker image:
+
+```
+docker build -t website-indexer:main .
+```
+
+To run the image using sample data:
+
+```
+docker run -it -p 8000:8000 website-indexer:main
+```
+
+To run the image using a local database dump:
+
+```
+docker run \
+    -it \
+    -p 8000:8000 \
+    -v /path/to/local/dump:/data \
+    -e CRAWL_DATABASE=/data/crawl.sqlite3 \
+    website-indexer:main
+```
+
 ## Development
 
 ### Testing
diff --git a/requirements/base.txt b/requirements/base.txt
index 74a17da..43b4ead 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -1,12 +1,12 @@
 beautifulsoup4==4.12.2
 click==8.0.4
 cssselect==1.1.0
-Django==3.2.22
+Django==3.2.25
 django-click==2.3.0
 django-debug-toolbar==3.2.4
 django-filter==21.1
 django-modelcluster==5.3
-djangorestframework==3.13.1
+djangorestframework==3.15.1
 djangorestframework-csv==2.1.1
 whitenoise==5.3.0