From ebc94e60c30d54399171ad448621a0d7e77261ef Mon Sep 17 00:00:00 2001
From: Andy Chosak
Date: Thu, 2 Nov 2023 09:51:45 -0400
Subject: [PATCH 1/2] Update README and test database

This change updates the repository README due to the recent rename from
"crawsqueal" to "website-indexer". It also documents the new wpull-based
crawler added in PR 81. Additionally, it updates the test database with some
test data that should have come along with that PR.
---
 README.md             | 78 ++++++++++--------------------------------
 sample/sample.sqlite3 | Bin 139264 -> 135168 bytes
 2 files changed, 18 insertions(+), 60 deletions(-)

diff --git a/README.md b/README.md
index 7a660cb..2e84ae7 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,8 @@
-# crawsqueal = "crawl" + "SQL" 🦜
+# website-indexer 🪱
 
-Explore a website archive in your browser.
+This repository crawls a website and stores its content in a SQLite database file.
 
-First, you'll need a
-[Website ARChive (WARC) file](https://archive-it.org/blog/post/the-stack-warc-file/)
-generated by crawling your website of interest. This repository contains
-[one method to run a crawler](#generating-a-crawl-database-from-a-warc-file),
-although numerous other popular tools exist for this purpose. Alternatively,
-you can use an existing WARC from another source, for example the
-[Internet Archive](https://archive.org/search.php?query=mediatype%3A%28web%29).
-
-Next, use this repository to convert your WARC file into a SQLite database file
-for easier querying. Use the SQLite command-line interface to
+Use the SQLite command-line interface to
 [make basic queries](#searching-the-crawl-database)
 about website content including:
 
@@ -24,44 +15,26 @@ about website content including:
 - Crawler errors (404s and more)
 - Redirects
 
-Finally,
-[run the viewer application](#running-the-viewer-application)
-in this repository to explore website content in your browser.
+This repository also contains a Django-based
+[web application](#running-the-viewer-application)
+to explore crawled website content in your browser.
 Make queries through an easy-to-use web form, review page details, and export
 results as CSV or JSON reports.
 
-## Generating a crawl database from a WARC file
-
-A [WARC](https://archive-it.org/blog/post/the-stack-warc-file/)
-(Web ARChive) is a file standard for storing web content in its original context,
-maintained by the International Internet Preservation Consortium (IIPC).
-
-Many tools exist to generate WARCs.
-The Internet Archive maintains the
-[Heritrix](https://github.com/internetarchive/heritrix3) web crawler that can generate WARCs;
-a longer list of additional tools for this purpose can be found
-[here](http://dhamaniasad.github.io/WARCTools/).
+## Crawling a website
 
-The common command-line tool
-[wget](https://wiki.archiveteam.org/index.php/Wget_with_WARC_output)
-can also be used to generate WARCs. A sample script to do so can be found in this repository,
-and can be invoked like this:
+Create a Python virtual environment and install required packages:
 
-```sh
-./wget_crawl.sh https://www.consumerfinance.gov/
 ```
-
-This will generate a WARC archive file named `crawl.warc.gz`.
-This file can then be converted to a SQLite database using a command like:
-
-```sh
-./manage.py warc_to_db crawl.warc.gz crawl.sqlite3
+python3.6 -m venv venv
+source venv/bin/activate
+pip install -r requirements/base.txt
 ```
 
-Alternatively, to dump a WARC archive file to a set of CSVs:
+Crawl a website:
 
 ```sh
-./manage.py warc_to_csv crawl.warc.gz
+./manage.py crawl https://www.consumerfinance.gov crawl.sqlite3
 ```
 
 ## Searching the crawl database
@@ -174,7 +147,7 @@ pip install -r requirements/base.txt
 Optionally set the `CRAWL_DATABASE` environment variable to point to a local
 crawl database:
 
 ```
-export CRAWL_DATABASE=cfgov.sqlite3
+export CRAWL_DATABASE=crawl.sqlite3
 ```
 
 Finally, run the Django webserver:
@@ -237,13 +210,12 @@ yarn fix
 
 ### Sample test data
 
-This repository includes sample web archive and database files for testing
-purposes at `/sample/crawl.warc.gz` and `/sample/sample.sqlite3`.
+This repository includes a sample database file for testing purposes at `/sample/sample.sqlite3`.
 
 The sample database file is used by the viewer application when no other crawl
 database file has been specified.
 
-The source website content used to generate these files is included in this repository
+The source website content used to generate this file is included in this repository
 under the `/sample/src` subdirectory.
 
 To regenerate these files, first serve the sample website locally:
@@ -256,24 +228,10 @@ This starts the sample website running at http://localhost:8000.
 Then, in another terminal, start a crawl against the locally running site:
 
 ```
-./wget_crawl.sh http://localhost:8000
-```
-
-This will create a WARC archive named `crawl.warc.gz` in your working directory.
-
-Next, convert this to a test database file:
-
-```
-./manage.py warc_to_db crawl.warc.gz sample.sqlite3
+./manage.py crawl http://localhost:8000/ --recreate ./sample/src/sample.sqlite3
 ```
 
-This will create a SQLite database named `sample.sqlite3` in your working directory.
-
-Finally, use these newly created files to replace the existing ones in the `/sample` subdirectory:
-
-```
-mv crawl.warc.gz sample.sqlite3 ./sample
-```
+This will overwrite the test database with a fresh crawl.
 
 ## Deployment
 
diff --git a/sample/sample.sqlite3 b/sample/sample.sqlite3
index 01b0e9b6703327fad0bf461dc31ca67c3b5bd1e8..c5c44fee30664fbf5ebd52ebc2dfd850e28176a9 100644
GIT binary patch
delta 2618
[base85-encoded binary delta data omitted]
delta 911
[base85-encoded binary delta data omitted]

From 47d0356e5aacc17f739a26ab9b0f18ee58906b44 Mon Sep 17 00:00:00 2001
From: Andy Chosak
Date: Thu, 2 Nov 2023 10:38:48 -0400
Subject: [PATCH 2/2] Fix tests that rely on test database

---
 viewer/tests/test_csv_export.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/viewer/tests/test_csv_export.py b/viewer/tests/test_csv_export.py
index e6e9015..049c5b5 100644
--- a/viewer/tests/test_csv_export.py
+++ b/viewer/tests/test_csv_export.py
@@ -12,5 +12,5 @@ def test_csv_generation(self):
         self.assertEqual(response["Content-Type"], "text/csv; charset=utf-8")
 
         rows = BytesIO(response.getvalue()).readlines()
-        self.assertEqual(len(rows), 3)
+        self.assertEqual(len(rows), 4)
         self.assertEqual(rows[0], codecs.BOM_UTF8 + b"url,title,language\r\n")
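
A note on the assertion change in PATCH 2/2: the test reads the CSV export back with `readlines()`, so the expected count is one BOM-prefixed header row plus one row per page exported from the sample database. The previous value of 3 corresponds to a header plus two pages; the new value of 4 suggests the regenerated database exports three. The following is a minimal, self-contained sketch of that arithmetic only; the example.com rows are illustrative and are not taken from the actual sample data:

```python
import codecs
import csv
from io import BytesIO, StringIO

# Build a CSV shaped like the export the test checks: a UTF-8 BOM,
# a header row, then one row per crawled page.
buffer = StringIO()
writer = csv.writer(buffer)  # csv.writer terminates rows with \r\n by default
writer.writerow(["url", "title", "language"])
writer.writerow(["https://example.com/", "Home", "en"])
writer.writerow(["https://example.com/about/", "About us", "en"])
writer.writerow(["https://example.com/contact/", "Contact", "en"])

payload = codecs.BOM_UTF8 + buffer.getvalue().encode("utf-8")
rows = BytesIO(payload).readlines()

assert len(rows) == 4  # header + three pages, matching the updated assertion
assert rows[0] == codecs.BOM_UTF8 + b"url,title,language\r\n"
```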
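
More generally, the updated README points readers at the SQLite command-line interface for querying the crawl database. An equivalent quick look from Python's standard library might look like the sketch below; it assumes a `crawl.sqlite3` file produced by the `./manage.py crawl` command shown above, and reads table names from the database itself rather than assuming any schema:

```python
import sqlite3

# Open the crawl database and list its tables. Note that sqlite3.connect()
# will silently create an empty database if the file does not exist.
connection = sqlite3.connect("crawl.sqlite3")
tables = connection.execute(
    "SELECT name FROM sqlite_master WHERE type = 'table' ORDER BY name"
).fetchall()
for (name,) in tables:
    print(name)
connection.close()
```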