From ebc94e60c30d54399171ad448621a0d7e77261ef Mon Sep 17 00:00:00 2001
From: Andy Chosak
Date: Thu, 2 Nov 2023 09:51:45 -0400
Subject: [PATCH 1/2] Update README and test database

This change updates the repository README due to the recent rename from
"crawsqueal" to "website-indexer". It also documents the new wpull-based
crawler added in PR 81. Additionally, it updates the test database with some
test data that should have come along with that PR.
---
 README.md             | 78 ++++++++++--------------------------------
 sample/sample.sqlite3 | Bin 139264 -> 135168 bytes
 2 files changed, 18 insertions(+), 60 deletions(-)

diff --git a/README.md b/README.md
index 7a660cb..2e84ae7 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,8 @@
-# crawsqueal = "crawl" + "SQL" 🦜
+# website-indexer 🪱
 
-Explore a website archive in your browser.
+This repository crawls a website and stores its content in a SQLite database file.
 
-First, you'll need a
-[Website ARChive (WARC) file](https://archive-it.org/blog/post/the-stack-warc-file/)
-generated by crawling your website of interest. This repository contains
-[one method to run a crawler](#generating-a-crawl-database-from-a-warc-file),
-although numerous other popular tools exist for this purpose. Alternatively,
-you can use an existing WARC from another source, for example the
-[Internet Archive](https://archive.org/search.php?query=mediatype%3A%28web%29).
-
-Next, use this repository to convert your WARC file into a SQLite database file
-for easier querying. Use the SQLite command-line interface to
+Use the SQLite command-line interface to
 [make basic queries](#searching-the-crawl-database)
 about website content including:
 
@@ -24,44 +15,26 @@ about website content including:
 - Crawler errors (404s and more)
 - Redirects
 
-Finally,
-[run the viewer application](#running-the-viewer-application)
-in this repository to explore website content in your browser.
+This repository also contains a Django-based
+[web application](#running-the-viewer-application)
+to explore crawled website content in your browser.
 Make queries through an easy-to-use web form, review page details, and export
 results as CSV or JSON reports.
 
-## Generating a crawl database from a WARC file
-
-A [WARC](https://archive-it.org/blog/post/the-stack-warc-file/)
-(Web ARChive) is a file standard for storing web content in its original context,
-maintained by the International Internet Preservation Consortium (IIPC).
-
-Many tools exist to generate WARCs.
-The Internet Archive maintains the
-[Heritrix](https://github.com/internetarchive/heritrix3) web crawler that can generate WARCs;
-a longer list of additional tools for this purpose can be found
-[here](http://dhamaniasad.github.io/WARCTools/).
+## Crawling a website
 
-The common command-line tool
-[wget](https://wiki.archiveteam.org/index.php/Wget_with_WARC_output)
-can also be used to generate WARCs. A sample script to do so can be found in this repository,
-and can be invoked like this:
+Create a Python virtual environment and install required packages:
 
-```sh
-./wget_crawl.sh https://www.consumerfinance.gov/
 ```
-
-This will generate a WARC archive file named `crawl.warc.gz`.
-This file can then be converted to a SQLite database using a command like:
-
-```sh
-./manage.py warc_to_db crawl.warc.gz crawl.sqlite3
+python3.6 -m venv venv
+source venv/bin/activate
+pip install -r requirements/base.txt
 ```
 
-Alternatively, to dump a WARC archive file to a set of CSVs:
+Crawl a website:
 
 ```sh
-./manage.py warc_to_csv crawl.warc.gz
+./manage.py crawl https://www.consumerfinance.gov crawl.sqlite3
 ```
 
 ## Searching the crawl database
@@ -174,7 +147,7 @@ pip install -r requirements/base.txt
 Optionally set the `CRAWL_DATABASE` environment variable to point to a local
 crawl database:
 
 ```
-export CRAWL_DATABASE=cfgov.sqlite3
+export CRAWL_DATABASE=crawl.sqlite3
 ```
 
 Finally, run the Django webserver:
@@ -237,13 +210,12 @@ yarn fix
 
 ### Sample test data
 
-This repository includes sample web archive and database files for testing
-purposes at `/sample/crawl.warc.gz` and `/sample/sample.sqlite3`.
+This repository includes a sample database file for testing purposes at `/sample/sample.sqlite3`.
 
 The sample database file is used by the viewer application when no other crawl
 database file has been specified.
 
-The source website content used to generate these files is included in this repository
+The source website content used to generate this file is included in this repository
 under the `/sample/src` subdirectory.
 
 To regenerate these files, first serve the sample website locally:
@@ -256,24 +228,10 @@ This starts the sample website running at http://localhost:8000.
 Then, in another terminal, start a crawl against the locally running site:
 
 ```
-./wget_crawl.sh http://localhost:8000
-```
-
-This will create a WARC archive named `crawl.warc.gz` in your working directory.
-
-Next, convert this to a test database file:
-
-```
-./manage.py warc_to_db crawl.warc.gz sample.sqlite3
+./manage.py crawl http://localhost:8000/ --recreate ./sample/src/sample.sqlite3
 ```
 
-This will create a SQLite database named `sample.sqlite3` in your working directory.
-
-Finally, use these newly created files to replace the existing ones in the `/sample` subdirectory:
-
-```
-mv crawl.warc.gz sample.sqlite3 ./sample
-```
+This will overwrite the test database with a fresh crawl.
 
 ## Deployment
 
diff --git a/sample/sample.sqlite3 b/sample/sample.sqlite3
index 01b0e9b6703327fad0bf461dc31ca67c3b5bd1e8..c5c44fee30664fbf5ebd52ebc2dfd850e28176a9 100644
GIT binary patch
delta 2618
[base85-encoded binary delta data omitted]
delta 911
[base85-encoded binary delta data omitted]

From 47d0356e5aacc17f739a26ab9b0f18ee58906b44 Mon Sep 17 00:00:00 2001
From: Andy Chosak
Date: Thu, 2 Nov 2023 10:38:48 -0400
Subject: [PATCH 2/2] Fix tests that rely on test database

---
 viewer/tests/test_csv_export.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/viewer/tests/test_csv_export.py b/viewer/tests/test_csv_export.py
index e6e9015..049c5b5 100644
--- a/viewer/tests/test_csv_export.py
+++ b/viewer/tests/test_csv_export.py
@@ -12,5 +12,5 @@ def test_csv_generation(self):
         self.assertEqual(response["Content-Type"], "text/csv; charset=utf-8")
 
         rows = BytesIO(response.getvalue()).readlines()
-        self.assertEqual(len(rows), 3)
+        self.assertEqual(len(rows), 4)
         self.assertEqual(rows[0], codecs.BOM_UTF8 + b"url,title,language\r\n")
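
A note on the assertion change in PATCH 2/2: the test reads the CSV export back with `readlines()`, so the expected count is one BOM-prefixed header row plus one row per page exported from the sample database. The previous value of 3 corresponds to a header plus two pages; the new value of 4 suggests the regenerated database exports three. The following is a minimal, self-contained sketch of that arithmetic only; the example.com rows are illustrative and are not taken from the actual sample data:

```python
import codecs
import csv
from io import BytesIO, StringIO

# Build a CSV shaped like the export the test checks: a UTF-8 BOM,
# a header row, then one row per crawled page.
buffer = StringIO()
writer = csv.writer(buffer)  # csv.writer terminates rows with \r\n by default
writer.writerow(["url", "title", "language"])
writer.writerow(["https://example.com/", "Home", "en"])
writer.writerow(["https://example.com/about/", "About us", "en"])
writer.writerow(["https://example.com/contact/", "Contact", "en"])

payload = codecs.BOM_UTF8 + buffer.getvalue().encode("utf-8")
rows = BytesIO(payload).readlines()

assert len(rows) == 4  # header + three pages, matching the updated assertion
assert rows[0] == codecs.BOM_UTF8 + b"url,title,language\r\n"
```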
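
More generally, the updated README points readers at the SQLite command-line interface for querying the crawl database. An equivalent quick look from Python's standard library might look like the sketch below; it assumes a `crawl.sqlite3` file produced by the `./manage.py crawl` command shown above, and reads table names from the database itself rather than assuming any schema:

```python
import sqlite3

# Open the crawl database and list its tables. Note that sqlite3.connect()
# will silently create an empty database if the file does not exist.
connection = sqlite3.connect("crawl.sqlite3")
tables = connection.execute(
    "SELECT name FROM sqlite_master WHERE type = 'table' ORDER BY name"
).fetchall()
for (name,) in tables:
    print(name)
connection.close()
```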