From b67924cca8fa9fc6298b3f5ef976f2f770561a28 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Sep 2024 18:29:44 +0100 Subject: [PATCH] Add Jupyter Book documentation --- .github/workflows/ci_cd.yaml | 27 ++- .gitignore | 1 + Makefile | 2 +- docker/docs.Dockerfile | 5 - docs/_config.yml | 22 ++ docs/_static/style.css | 2 + docs/_toc.yml | 5 + docs/{Home.py => intro.md} | 16 +- docs/logo.png | Bin 0 -> 10188 bytes docs/pages/Methodology.py | 301 --------------------------- docs/pages/Validation.py | 88 -------- docs/utils.py | 7 + policyengine_uk_data/utils/github.py | 7 +- pyproject.toml | 3 +- 14 files changed, 71 insertions(+), 415 deletions(-) delete mode 100644 docker/docs.Dockerfile create mode 100644 docs/_config.yml create mode 100644 docs/_static/style.css create mode 100644 docs/_toc.yml rename docs/{Home.py => intro.md} (81%) create mode 100644 docs/logo.png delete mode 100644 docs/pages/Methodology.py delete mode 100644 docs/pages/Validation.py create mode 100644 docs/utils.py diff --git a/.github/workflows/ci_cd.yaml b/.github/workflows/ci_cd.yaml index 4df3e8c..0e7afc5 100644 --- a/.github/workflows/ci_cd.yaml +++ b/.github/workflows/ci_cd.yaml @@ -30,6 +30,29 @@ jobs: user: __token__ password: ${{ secrets.PYPI }} skip-existing: true + publish-docs: + name: Publish documentation + runs-on: ubuntu-latest + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch all history for all tags and branches + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.12 + - name: Install package + run: pip install -e ".[dev]" + - name: Build Jupyter Book + run: make documentation + - name: Deploy documentation + uses: JamesIves/github-pages-deploy-action@releases/v4 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + BRANCH: gh-pages + FOLDER: docs/_build/html lint: runs-on: ubuntu-latest name: Lint @@ -47,7 +70,7 @@ jobs: run: black . -l 79 --check test: - name: Build and Test + name: Build and test runs-on: ubuntu-latest steps: - name: Checkout code @@ -68,6 +91,8 @@ jobs: run: make data - name: Run tests run: pytest + - name: Test documentation builds + run: make documentation check-version: name: Check version if: github.event_name == 'pull_request' diff --git a/.gitignore b/.gitignore index e417f9d..61b49b8 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ !incomes.csv !tax_benefit.csv !demographics.csv +**/_build diff --git a/Makefile b/Makefile index ed38cdd..f78fd5b 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ docker: docker buildx build --platform linux/amd64 . -t policyengine-uk-data:latest documentation: - streamlit run docs/Home.py + jb clean docs && jb build docs data: python policyengine_uk_data/datasets/frs/dwp_frs.py diff --git a/docker/docs.Dockerfile b/docker/docs.Dockerfile deleted file mode 100644 index 090ca7e..0000000 --- a/docker/docs.Dockerfile +++ /dev/null @@ -1,5 +0,0 @@ -FROM python:latest -COPY . . -RUN make install -EXPOSE 8080 -ENTRYPOINT ["streamlit", "run", "docs/Home.py", "--server.port=8080", "--server.address=0.0.0.0"] diff --git a/docs/_config.yml b/docs/_config.yml new file mode 100644 index 0000000..711cc60 --- /dev/null +++ b/docs/_config.yml @@ -0,0 +1,22 @@ +title: PolicyEngine UK data +author: PolicyEngine +copyright: "2024" +logo: logo.png + +execute: + execute_notebooks: off + +repository: + url: https://github.com/policyengine/policyengine-uk-data + branch: master + path_to_book: docs + +sphinx: + config: + html_js_files: + - https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.7/require.min.js + html_theme: furo + pygments_style: default + html_css_files: + - style.css + nb_remove_code_source: true \ No newline at end of file diff --git a/docs/_static/style.css b/docs/_static/style.css new file mode 100644 index 0000000..e511f94 --- /dev/null +++ b/docs/_static/style.css @@ -0,0 +1,2 @@ +@import url('https://fonts.googleapis.com/css2?family=Roboto+Serif:opsz@8..144&family=Roboto:wght@300&display=swap'); + diff --git a/docs/_toc.yml b/docs/_toc.yml new file mode 100644 index 0000000..4b8640a --- /dev/null +++ b/docs/_toc.yml @@ -0,0 +1,5 @@ +format: jb-book +root: intro +chapters: +- file: methodology.ipynb +- file: validation.ipynb diff --git a/docs/Home.py b/docs/intro.md similarity index 81% rename from docs/Home.py rename to docs/intro.md index 674f08e..af04a85 100644 --- a/docs/Home.py +++ b/docs/intro.md @@ -1,16 +1,5 @@ -import streamlit as st -from policyengine_uk_data.utils.download_docs_prerequisites import ( - download_data, -) +# Introduction -download_data() - -st.set_page_config(layout="wide") - -st.title("PolicyEngine-UK-Data") - -st.write( - """ PolicyEngine-UK-Data is a package that creates representative microdata for the UK, designed for input in the PolicyEngine tax-benefit microsimulation model. This tool allows users to explore the data sources, validation processes, and enhancements @@ -22,5 +11,4 @@ * An accurate representation of the current UK household sector *now*. This repository is dedicated to the second of those. In this documentation, we'll explain how we do that, but we'll also use our model (the first bullet) to see what we end up with when we combine the two, and measure up against other organisations doing the same thing. -""" -) + diff --git a/docs/logo.png b/docs/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..12736e4dce8158bb6ed2557a2bf9d0856811758e GIT binary patch literal 10188 zcmd5?_divC{J+^Fgp@KXD=Sg4M+(7Dak3Ok3EL!r2nD3~2_tiu%TSLinlj4$ou(w#vd2fn8 zv={V2=i{s%T15&QsS|e;rdUcLdcSPf_49o@HO);#va{3}qa%=N+>UbYYlM3-!6fQ-CVxAd+QXv zdrLCQR{zB*F+$;Vb_(#ljwr;k7H?#jG7dwV5ULBD$ge1JRqGNgZ10Tfx~{ufiaeYJ z@r_XH6QMc!arP%te79p{QmUrwRW41yUktcap(fU_;QmF0+cnNbQj%GV`RU-A3|Zc* z;Tv%y@!hA-EUz~5Brv2aHxwn5CM&Fb>*ibN=j1_~_v%2yLOND9KQc``t+Nh?EZgw- zV5u=TL#W92TaLYLgxxt?n^kN~vBF!8UIn|ol3BjH8gMn%_H!VbIL%{kYtZLjzMAE# zV5Y?nBQS3}k1dHW(y3FRgV+)H1k6~RQgUnNXlX0xg&OFnmD+=N)jMc!`$dDY*)yNw z+IXl~Nhg|xy)vh7f!i*{!vevP#21M3~X z9yfdTv{K_WJG>s^<>S9Q;gI zYTmiHDSI@d{dVL}_nh>WJP7278l&Ev;y`+$zwXVC=+Z>j;}$~qMozwFt?-^u3t*cX zVV@2ISMxw(bGFX{K|ZE9OMA)<70Id2lN_eEMxo6w{VmD%M=)v!TTv#z_SP<0CaKoT zpFE%46gvk3l@5PZg|(OOOUmTl-YcpuL)XMC*P0~Tm;WgynsoIE5O|$ovhCb#L6EsU zS7Zq!g%MVFmz(a2R@vhXh`sE4A$ZyI1e*&W(64}>=jR$W()xF_P|Ee4d%R7iKSYcn z&bE9d;L~o8UF*ucE`y&)0@ePUu^TNKd=-=K ztL(Skw+P1wQ92WXWvFV-fr6{A^0e3GXc;D{@6Wtnh{&56`sg_-oW-eo2Q|0IKe+u> zl*65k%Qa1UI!4VM1ma~XrJ0+wv)mamb+hGB9-k^91Pvx$f1hLi&`#&ef|=wB{`&Yx z+MR#Ww@PW}BGApgyCRn_2F;h^n#c8e9@iMZ6DY7uJK)d}loZ;gy{LO-y_Hu61d97u zMY*|Qvfeu^%nD00^oJsw73P)BITv99^T*>Pl#RUe4OppaZ?`jmKy!tVkO`?It%*mj zV-BaLMV_auvNvtxxuRfVKaCyp4UVb*$>k@7SOp?;R}>;e(gql?E2ljL*JXCJbGfZ4 zI3vKu?{2pgM+N2^#LrYA$S2w70K(lI9#$34QHbP!>>i;{jQ+hs#4_$y9bUH`=$?4} zUoX$``YCk#@H$cBv_a8a+BVmPtZ}@>*%-!kcl7R1=}4(A%F%tVsQe+#x3VA}y7MRq z=U?*-UYUfNA4x0f=IPGqyGzN7&HhSldp?~bykB&&8@x?L-sNOmI3D?1rek!#^4K!E zZL&Ld%qybb%eL2wk&dy9)|H65*~3&p-|8Cdriv#WD3>c3BzPNhG;dnr`tbTDd7bF(0e= zP!KuXMYL&)S_;p)G^UbJe9FY%-4~47N=Qr&o8Xo@3sJ3_n8on1z%W?Lmtb?!i1O1C z6~JakQ0&K~w;=Y?CVkS5y%Jw-wl!^s~0^;H3UyE0Z0GQ9n6uowv;M(aXk`b_W<0RF{5TU=-q>!c?-L zN8u@D$!o4wHg5v??c^VG2*pvG`PZ=*lPS?X7hcSGD>7lY2ie~v%+ucI_e1Y=&VGJt}8isubbg1%+jSPb`~&1AJ(-Gy1{cE8oU`ppl-ubCT3 zul%wmP$8S_cTAG#{pndkm@xIJSf0AaKQ5Px$An~XPOIo#VNrx zKp@q)F+Fo0l=4HGD8Mab+2)>x+21&Bi-5r_3hzH2x64hNKIKkMQIP1y$m2$G$2Q86 z_TrYxj^4Ai*v?Vc?h`daf7DSmpCyh#rG?6FHyHP{QkijTbnQ_OqsT3#$fj){=H_i1 z@BW{sX-S8`c|%XF1;258Tac^OeX$pT@%E5>+@bF0`!tc}9}c$~b4+alO>Z9O8y6dqHSEV44<#&zdBfF$ zqt?m4KQ(DPJ`mdc+TB2vS7_t6)AWTWJ#P^^)42Z``tccKXmobKza)z?!>cN1KiYd} zyz{KX`U}oaV5GC?<%1IENG%UvVhGAZO}G=1AZd{l?Yhgki9C4htzymF;$4;-?mOzbPTina8{<%OHd{b_Ui%d}0=&e@E9 z9Jrr}NA%}C$w44{m;1cF97X;4R1JE*`^E0>eIW5g$a@b95-iPv%L`%GLnda7UXDZ3 z9$@HZ8O`+mk!_~c&w{w($GhU2Aq)X5j*2xk7DFiS^;#Y~_5trt%k7QN87)7BF};?j z&KH*XQR^PX{f}uVe^#p_Ix_fzBj+C5>EEG6)SKr`Y5r&lnEPbNuMzp0dS(iqgXgoH zR~g3;qBz$Vn4X%Hv(dJkf$2igHFa$HS6!r&dzw>)oj#d)|7e16)JD5pUpk2Y8_gCM z3tQ1_{KdP|KbbtesBboXHYM4Z1;pbxPo#SF*PKE}lbteswYQr3X{Z(qFvy+uAb;j(-q@|QdJcP7tqk|}RQW#f8}@t-L>A_b(CfX@Aq zgA!|6BwGs{dU$=GVd09;#DWL!wZZa?FH{qY8w&IVsx@UIu}bQzj?-!v-?j0i_K0IQ z$S1FNpHF#P_D-d~iv&R1m%OO^GW>!IXhFK0AFXOLNTq#4F>FF~lnqov6GV5<`n9-pB+vSS z@>V#6abL|nh;#-gQasMLMSRdcEI0KhP4FiQDuqN9dOFD7Sdb;%G!r<(Rf5!~6x)YP zxNoo7-27sSt-MP_@3`7rXD)E-TSF4+HjI9_2m?I_&}o^lY^b{7_&g^IWP%64_vHxM+|?oxO?|}QZ^d|q0Baj zoa)6gglEfIO0ViNT0x5M8_OHCGnpFx%V2&lu=}sW%)Toecc`ZL`HtM8<=`Rek~Wh+ zbDpf(iMtO#h3o3%iE9)>$&ssCnS~|N5e_jIz5Z!<1t64I;Xlix1MtZjsM9wi-_ry> zN3TW8rx^OQB*yt>SBV>0cXIB%`|fnf#{8$`*l^dSXaBA>H!94pA6E;)YgsY*NQ;BD zADBxYb*ydQJKw)pPlgg>n=g7>XDFr;ETCmd7moMNJ0R!Kyf73?nQL4zp4Xp9Yo9l@ zN($h6-g~^gi+xh=5y%(WP;uNbmVaEy_GhPje}??lq&wSKU#z78QS#~jpEI&H~ zogpl@gTlP@3b=4B?j@Mjew`Q#%)8aR`K-3|a_GnXQ$=mpt^s3kQSL_XwOdJLjg-ZN zUj=rH*mR1n!e%nr4}5zPj~9}U-4XKCA={IlC-rqayGMSq*| zRXB+@9bfMin=+@VfkMADzk<9kkUFxr{I1LM<&Yi#C+O3K5XBd|*!8q!)3X{l=r!9X ze_|17vA5OE6mjcJLJ)A|=pMJfUy`CBEmLDW=cE%8kMu_;KLwQw?Jf23Plk(EN~i%r zlS?7Bpxe?%CmUL`(i>XU;QugR0$~1twfuvn9ycxnMQkTa6|~ z@zO^Dbt{;;*`KhziZ)Yw;znz(+c~WdjOj!y(fifnn~!GvzNc9H$uB+8G?@Wio*rPr zl~J)cDJqZEr2x(esl6}@?VoYO!1vvUC%^YRQ)uVE@e6e8^bljnDNE{X4 z;Cwwk^tHQ`HJ7Pmy)v^?P}GFzpI_a+CFdh{*Y8I>udD=m3lqAKTHaYFyr<>$b&i(} z%(rmL`x+UitJ&l2Qwf?T&ydvQh7Zg$Y?vb)nqM5ICHj=q{_a=CRPJg5`Wn!Y`G@p3 zm7$)R&OFQtbUrnlOoQ>X-sNzA#0N_r@4;$Yl9}ji?<_O@^t8zE$|yaQP`<@n;B5Q9 z*FJsRQCBN-4D}zCVJ5134iPzjnI1 zc`6Y1vC(YiwegzG+Z~#3H)c^X0k!cNQ763Lnoz(>YI)T7_@wlQ3Pl?e_8}bCh|W9s zc9@|=qzVvV_C2~~FjU%V+BSrE=iqEMRW$v3(%+9&GMGeQtd`S~j9-y?iM|eN`Ss z?8njEyB~S%i71uFml>^Iq72i!1z+)8G5b@U9~ICJ4SE+=BUW9(sWLxN+S}w9bmAak z6Fa)}keT{95Hj6XuFZtE=}^~8Zoz{ddlz8a>U0nI8FwTtGK87Fgvjy&cq(8d^N;MG zDJzC~iRwB8q-k;Dgp~H|zs2K?6-NF_PyrHwF0R1T_Dm0gVBvA-hsz^{RT=hoQX;J# zhYu<_6l{-?@kwwXc=FfdK6aZG>EwE31x{QlUbMZd!&ne~Hc*)+y7YtGb(Np|Q3fax zd7)h-^HCaX4JDyxDSGuu;QV)7(4s}|l$=+9;ThU)Qfx-8r@`(P20=ybfb|4IbmY!^ zj6{06oGuDjoT^eXw~IVqoSdhw52{0ieW96kl;r8e(mT}2o72n!2P8e@En}amC#bt& z@6{9&rq({m1-+AaCvXS@mORD^W)VBfBYbw80940l=Gf-cS9`EsJcC&e9{4+ zyB7oBZ?Zxi^ZA{^s$)*8KX55q4W_%(#8lZp zrgk)wqB+bu6f0<+k)!_Y{=uK&FDmTXz3Diq5@77lyh1a_urqTRFSV@dFz7nwHkXfm z4O{r3qJN(#%7Geo;96!<=-{5=5Nho^*ltj^$Z0svLL--9=Z!{{Ah=^qQ)phtp#~iB;dladMd^T$HINwpDqv%)sbY-7wz>K3U>c{)$lz2Rk;uls87VtI~3#5g+>)ozYX zdxS1=fVv-~{MBARL+p9~{$NmPKteURR5a1R0mkJZB_O}1JR*sy95zFaLl_pD^AAx* z@zIi5GZh(^vHF%qU*nJpfmIFoVM2{SDYj`~ah#C9WfXd6`WSIV7hpAz>20OtgCEqP z^r4;WY7^m!#m)IDWz||$#hIydB`$es%~`8GVr5`odCwez=~muWgfOr=EW_B^uufC3 zoFNqQpR@{bbgVYhTlo#p+SvN4O_<_chfKc;x;Ll4V)A#WnhxDpKZVf<|CBWMu+2{g z399k}HKc&ucyvYF_0rXob*S=x3>m8`!Z+SDxK8xD7khQNsQ6fVqN)%kG014vYjI~a zwbu|YPq$F~2dH?M-Z~{t>KWYnT5JQW4zd<)<7iF&E?kunh~8brw$U18s7O6xV=9m&ZsMF#s(MjeIoT~41S*;dbbb#ZuUCq zZkJ2(lrc%7DIGG$^c2XZYy?7(Y)iNnnzSd0dFo5#x zVqVh4oTd2v0w_CN%{NZ@)WSfn5;5yWGY2?(NCO}6+$-Y{5%%!ahWfA<&w#F4?l1H^ z6v@QFbEWvcw*;#f!`0-BQ`e8r9*|@j0eYvA^mcwai0%QEJ%L@Yz*1FneTBnON-;YNmM>_PbA& zral~0ijAhWeGy?;4Uw|vabn7epy?7`{I`*s@|_+^SA55H9s zqmyxJMXcdiu^(>&9u;&bJi6R5*Ptb`ysw5U4i-v{vhvL$sGW9cV?v*8mEu@Fmj!mk z01Kz8*h4TU3|n;+Ybfg*K_Q_0N#p)r@t0;n0n#RQ?Y(1#S`m+??ic-G#1Rj0G;4}= z?TrtWQgMe8C*&btMKwzCig5 z|JqAG+c|z(Xv~Z9drD^W7pH~I!#h`+I2forZ(BgD&Z^&m754~b`b1s-n)h~~fsjlc zR_<_;I#vczQoqct&aHVrqNxX@*0pCF;$7V-Am-0{+k1o1aut4)OTGOrTTsh`i{s!>t_XpZcJ5UK`s=L>Ng74v2-8DYq8B z!w>hE!NmeC_Lt)A9Z;w1)Dx^=7K;(fcV}|HXetq>YS|fDnI6UHiMRyCdSCnbb@RpV zN*k}UA$3C}6f>yd4^;Mg!_4<2faMK=s13{WNbS5%Y)Ok5-r3bUgO{$yAbRp(QJ*Qo z9PE9c+2NLMqS(#RnUF?AX2&~=ky|tt;dEya$@zs=9*+ElKrnS8-sLqLZDjyu;pQC|i zLF#L+zi9o##5+L9{KZGOspW%*-*eR3tBj@9&!3#)x@TKJlywH|5JwqULr(d97`d#}E~v zc?g+{q#*tY#LFk9#dbOHy}G%Idy@yur{TzGnZJY%u%saKLA%sG1{D9*XG5Cgv#Xhq zI$q(^YpVB4tZQkFhXD$34h3>(K_ntf?e|*@#<|lZ zlu^*?>kOkD-OLDKrT;Sr0YgB)Dn1^Ic_bC>qkjs~Q&QoY=Y4;srhw$qZyA+|x^fXA zERo|Q_@{U&9A|09|GWBlClZ=wq+Zm4Fdt5$P3h3_wevzZAldCJ7-}OMe1K+RVHlxv zq;$x%V}rT9dNiISR!D1c7Ky98y5Yl+w(=Q$mZc}SCl7-|-4(|tt{;|yxvu{qsThu|K1+mF4W<%2a14_o+^Jeyc z?b}=}3ky;59N*0RSq(p*{I1AV7!q&3TM7k--E(!o@jPG?i_{EA4B`q?zEttt z%z(pi1TpNgTE#4e2Xda_V#V1=sx=n1bB z%Mx{-+lbLzA-+K9uWR;S7Uf0vOXgg2o&o2Y?QVU-wlVMOcqgs5HD+D($VcQ9{XOz5 zf5WPf2z>_FXo9R(&?Ts2(xeWYW$;od{-?%S&*pe;3XK>`$F$5=?Xau02JncLmTJ-- zz1mrCvmLF<9PV%0qOU~ATqebrGEfz{i$rKR5lzkbc_L_7KvXF=H(bCdE4<7yU#WFMF^rmxCH&`ph`WjtU+ewZ!z2O92@QqnD6x~M$o=I*qb zXTCSv!y;T1)8FqR81vVn6lo+PHUjggC3fiXR>}L=@70>`uX%nBGp5*l?KF-|lqR)w zl?G`RBhMapDB0f3u(Ub6y^tw$T4l9!Y*!+ID6%{*eIVc^V?bZ_`Oa06>EUC8Cs!M^ z2pMZ)Qeh#-OP0UYL;Zd2y~Y}1W_Y0&Pu^6(m%RVzBx~#dg+TQG6av$;C{Tx*0XVD3 zz}BpG7MkVG1SSQ_?ht;k`&PhvA4iaTv2E>86Wpd$2qs%h)^GHwJw1R=e3&thIBhYR zML##;I^FC?WPTf~^bwpnaX&D0Zah7{f94t1+c8|O22@igxKLdFMwl~luPD3tKBnmD z-uC9l8VMz_S&jE%-QxS|!dM^tGc&Qnj?)?k!bqg<3uQl-O<00?FO{^{-kO%HN!xgq zn2Rq9CJhW6>dNK)%a4SA%^G>OsjN=cD*bu^#Kx4`o zE8Xl-30WN`l_F5t(D1Gqk0Y>dzxAaMX5b)UHY9ecyjLVEhLfeW7{P~}IXhU!+3OJB z)l~bY2`qnLH6F9QvaGTOG&oCB)aL|L@c}!zz;G?FOE5WDYv4RD04KO;Fn&=Sxl`e! zX7brLVRfc#v?Z1s=UqhPwBw<{XyY%xHkZC3m^=1H;w5|3<)g9}ABUmhcYC8d+Qe73 z`N-~$BJVeNfVV;vK)*j+#lLZ{Sxa(DiL+HVA{!=Xn`l{L2y1!6iuI)Vw6P^n)vUtF z7@sPzhIy$p8+uTAxX(mwv!MR2xIAsirfmhIQ<;&jG?dT9<(Pj;G27Or_=b?%27mzc N=z-pS#Jwk>{{sb>U7r8| literal 0 HcmV?d00001 diff --git a/docs/pages/Methodology.py b/docs/pages/Methodology.py deleted file mode 100644 index af6a120..0000000 --- a/docs/pages/Methodology.py +++ /dev/null @@ -1,301 +0,0 @@ -import streamlit as st -from policyengine_uk_data.utils.download_docs_prerequisites import ( - download_data, -) - -download_data() - -st.set_page_config(layout="wide") - -from policyengine_uk_data.utils import get_loss_results -from policyengine_uk_data import ( - FRS_2022_23, - ExtendedFRS_2022_23, - EnhancedFRS_2022_23, - ReweightedFRS_2022_23, -) -from policyengine_core.model_api import Reform -import plotly.express as px -import pandas as pd - -st.title("Methodology") - -st.write( - """ -In this page, we'll walk through step-by-step the process we use to create PolicyEngine's dataset. -* **Family Resources Survey**: we'll start with the FRS, looking at close it is to reality. To take an actual concrete starting point, we'll assume benefit payments are as reported in the survey. -* **FRS (+ tax-benefit model)**: we need to make sure that our tax-benefit model isn't doing anything unexpected. If we turn on simulation of taxes and benefits, does anything look unexpected? If not- great, we've turned a household survey into something useful for policy analysis. We'll also take stock here of what we're missing from reality. -* **Wealth and consumption**: the most obvious thing we're missing is wealth and consumption. We'll impute those here. -* **Fine-tuning**: we'll use reweighting to make some final adjustments to make sure our dataset is as close to reality as possible. -* **Validation**: we'll compare our dataset to the UK's official statistics, and see how we're doing. -""" -) - -st.subheader("Family Resources Survey") - -st.write( - """First, we'll start with the FRS as-is. Skipping over the technical details for how we actually feed this data into the model (you can find that in `policyengine_uk_data/datasets/frs/`), we need to decide how we're actually going to measure 'close to reality'. We need to define an objective function, and if our final dataset improves it a lot, we can call that a success. - -We'll define this objective function using public statistics that we can generally agree are of high importance to describing the UK household sector. These are things that, if the survey gets them wrong, we'd expect to cause inaccuracy in our model, and if we get them all mostly right, we'd expect to have confidence that it's a pretty accurate tax-benefit model. - -For this, we've gone through and collected: - -* **Demographics** from the ONS: ten-year age band populations by region of the UK, national family type populations and national tenure type populations. -* **Incomes** from HMRC: for each of 14 total income bands, the number of people with income and combined income of the seven income types that account for over 99% of total income: employment, self-employment, State Pension, private pension, property, savings interest, and dividends. -* **Tax-benefit programs** from the DWP and OBR: statistics on caseloads, expenditures and revenues for all 20 major tax-benefit programs. - -Let's first take a look at the initial FRS, our starting point, and what is generally considered the best dataset to use (mostly completely un-modified across major tax-benefit models), and see how close it is to reproducing these statistics. - -The table below shows the result, and: it's really quite bad! Look at the relative errors. -""" -) - - -@st.cache_data -def get_loss(dataset, reform, time_period): - loss_results = get_loss_results(dataset, time_period, reform) - - def get_type(name): - if "hmrc" in name: - return "Income" - if "ons" in name: - return "Demographics" - if "obr" in name: - return "Tax-benefit" - return "Other" - - loss_results["type"] = loss_results.name.apply(get_type) - return loss_results - - -reported_benefits = Reform.from_dict( - { - "gov.contrib.policyengine.disable_simulated_benefits": True, - } -) -loss_results = get_loss( - dataset=FRS_2022_23, reform=reported_benefits, time_period=2022 -).copy() -with st.expander(expanded=True, label="Objective function deep dive"): - st.dataframe(loss_results, use_container_width=True) - -st.write( - "It's easier to understand 'what kind of bad' this is by splitting out the statistics into those three categories. Here's a histogram of the absolute relative errors." -) - -fig = px.histogram( - loss_results, - x="abs_rel_error", - nbins=25, - title="Distribution of absolute relative errors", - labels={ - "value": "Absolute relative error", - "count": "Number of variables", - }, - color="type", -) - -st.plotly_chart(fig, use_container_width=True) - -st.write( - """A few notes: - -* We're comparing things in the same relevant time period (2022), and only doing a tiny amount of adjustment to the statistics: OBR statistics are taken directly from the latest EFO, ONS statistics are the most recent projections for 2022, and HMRC statistics are uprated from 2021 to 2022 using the same standard uprating factors we use in the model (and it's only one year adjustment). -* Demogaphics look basically fine: that's expected, because the DWP applies an optimisation algorithm to optimise the household weights to be as close as possible to a similar set of demographic statistics. It's a good sign that we use slightly different statistics than it was trained on and get good accuracy. -* Incomes look *not great at all*. We'll take a closer look below to understand why. But the FRS is well-known to under-report income significantly. -* Tax-benefit programs also look *not good*. And this is a concern! Because we're using this dataset to answer questions about tax-benefit programs, and the FRS isn't even providing a good representation of them under baseline law. -""" -) - -incomes = loss_results[loss_results.type == "Income"] -incomes["band"] = incomes.name.apply( - lambda x: x.split("band_")[1].split("_")[0] -).astype(int) -incomes["count"] = incomes.name.apply(lambda x: "count" in x) -incomes["variable"] = incomes.name.apply( - lambda x: x.split("_income_band")[0].split("_count")[0].split("hmrc/")[-1] -) - -variable = st.selectbox("Select income variable", incomes.variable.unique()) -count = st.checkbox("Count") -variable_df = incomes[ - (incomes.variable == variable) & (incomes["count"] == count) -] - -fig = px.bar( - variable_df, - x="band", - y=[ - "target", - "estimate", - "error", - "rel_error", - "abs_error", - "abs_rel_error", - ], - barmode="group", -) -st.plotly_chart(fig, use_container_width=True) - -st.write( - """There are a few interesting things here: - -* The FRS over-estimates incomes in the upper-middle of the distribution and under-estimates them in the top of the distribution. The reason for this is probably: the FRS misses out the top completely, and then because of the weight optimisation (which scales up the working-age age groups to hit their population targets), the middle of the distribution is inflated, overcompensating. -* Some income types are severely under-estimated across all bands: notably capital incomes. This probably reflects issues with the survey questionnaire design more than sampling bias. -""" -) -st.write("OK, so what can we do about it?") - -st.subheader("FRS (+ tax-benefit model)") - -st.write( - "First, let's turn on the model and check nothing unexpected happens." -) - - -original_frs_loss = loss_results.copy() -frs_loss = get_loss(FRS_2022_23, None, 2022).copy() -combined_frs_loss = pd.merge( - on="name", - left=original_frs_loss, - right=frs_loss, - suffixes=("_original", "_simulated"), -) -combined_frs_loss["change_in_abs_rel_error"] = ( - combined_frs_loss["abs_rel_error_simulated"] - - combined_frs_loss["abs_rel_error_original"] -) -# Sort columns -combined_frs_loss.sort_index(axis=1, inplace=True) -combined_frs_loss = combined_frs_loss.set_index("name") - -st.dataframe(combined_frs_loss, use_container_width=True) - -st.write( - """Again, a few notes: - -* You might be thinking: 'why do some of the HMRC income statistics change?'. That's because of the State Pension, which is simulated in the model. The State Pension is a component of total income, so people might be moved from one income band to another if we adjust their State Pension payments slightly. -* Some of the tax-benefit statistics change, and get better and worse. This is expected for a variety of reasons- one is that incomes and benefits are often out of sync with each other in the data (the income in the survey week might not match income in the benefits assessment time period). -""" -) - -st.subheader("Adding imputations") - -st.write( - """Now, let's add in the imputations for wealth and consumption. For this, we train *quantile regression forests* (essentially, random forest models that capture the conditional distribution of the data) to predict wealth and consumption variables from FRS-shared variables in other surveys. - -The datasets we use are: -* The Wealth and Assets Survey (WAS) for wealth imputations. -* The Living Costs and Food Survey (LCFS) for most consumption imputations. -* The Effects of Taxes and Benefits on Household Income (ETB) for '£ consumption that is full VAT rateable'. For example, different households will have different profiles in terms of the share of their consumption that falls on the VATable items. - -Below is a table showing how just adding these imputations changes our objective statistics (filtered to just rows which changed). Not bad pre-calibrated performance! And we've picked up an extra £200bn in taxes. -""" -) - -new_loss = get_loss(ExtendedFRS_2022_23, None, 2022).copy() -new_loss_against_old = pd.merge( - on="name", - left=frs_loss, - right=new_loss, - suffixes=("_simulated", "_imputed"), -) -new_loss_against_old["change_in_abs_rel_error"] = ( - new_loss_against_old["abs_rel_error_imputed"] - - new_loss_against_old["abs_rel_error_simulated"] -) - -st.dataframe( - new_loss_against_old[ - new_loss_against_old.change_in_abs_rel_error.abs() > 0.01 - ] -) - -st.subheader("Calibration") - -st.write( - "Now, we've got a dataset that's performs pretty well without explicitly targeting the official statistics we care about. So it's time to add the final touch- calibrating the weights to explicitly minimise error against the target set." -) - -calibrated_loss = get_loss(ReweightedFRS_2022_23, None, 2022).copy() -calibrated_loss_against_imputed = pd.merge( - on="name", - left=new_loss, - right=calibrated_loss, - suffixes=("_imputed", "_calibrated"), -) - -calibrated_loss_against_imputed["change_in_abs_rel_error"] = ( - calibrated_loss_against_imputed["abs_rel_error_calibrated"] - - calibrated_loss_against_imputed["abs_rel_error_imputed"] -) - -st.dataframe(calibrated_loss_against_imputed) - -st.write( - "The above table shows what this did to our target set. Mostly, we're hitting targets! But we are still under on income tax and many of the highest income band statistics. Let's take another look at the incomes, but with this new calibrated dataset." -) - -incomes = calibrated_loss[loss_results.type == "Income"] -incomes["band"] = incomes.name.apply( - lambda x: x.split("band_")[1].split("_")[0] -).astype(int) -incomes["count"] = incomes.name.apply(lambda x: "count" in x) -incomes["variable"] = incomes.name.apply( - lambda x: x.split("_income_band")[0].split("_count")[0].split("hmrc/")[-1] -) - -variable = st.selectbox( - "Select income variable", - incomes.variable.unique(), - key=1, -) -count = st.checkbox("Count", key=2) -variable_df = incomes[ - (incomes.variable == variable) & (incomes["count"] == count) -] - -fig = px.bar( - variable_df, - x="band", - y=[ - "target", - "estimate", - "error", - "rel_error", - "abs_error", - "abs_rel_error", - ], - barmode="group", -) -st.plotly_chart(fig, use_container_width=True) - -st.write( - """ -So, what's happening here seems like: the FRS just doesn't have enough high-income records for calibration to work straight away. The optimiser can't just set really high weights for the few rich people we do have, because it'd hurt performance on the demographic statistics. - -So, we need a solution to add more high-income records. What we'll do is: - -* Train a QRF model to predict the distributions of income variables from the Survey of Personal Incomes from FRS demographic variables. -* For each FRS person, add an 'imputed income' clone with zero weight. -* Run the calibration again. -""" -) - -st.subheader("The Enhanced FRS") - -st.write("Let's see how this new dataset performs.") - -efrs_loss = get_loss(EnhancedFRS_2022_23, None, 2022).copy() -efrs_loss_against_calibrated = pd.merge( - on="name", - left=calibrated_loss, - right=efrs_loss, - suffixes=("_calibrated", "_enhanced"), -) -efrs_loss_against_calibrated["change_in_abs_rel_error"] = ( - efrs_loss_against_calibrated["abs_rel_error_enhanced"] - - efrs_loss_against_calibrated["abs_rel_error_calibrated"] -) - -st.dataframe(efrs_loss_against_calibrated) diff --git a/docs/pages/Validation.py b/docs/pages/Validation.py deleted file mode 100644 index 39ea1f2..0000000 --- a/docs/pages/Validation.py +++ /dev/null @@ -1,88 +0,0 @@ -import streamlit as st -from policyengine_uk_data.utils.download_docs_prerequisites import ( - download_data, -) - -download_data() - -st.set_page_config(layout="wide") - -st.title("Validation") - -from policyengine_uk_data import EnhancedFRS_2022_23, FRS_2022_23, SPI_2020_21 -from policyengine_uk_data.utils.loss import get_loss_results -import pandas as pd - - -@st.cache_data -def get_validation(): - df = pd.DataFrame() - for dataset in [FRS_2022_23, EnhancedFRS_2022_23]: - for year in range(2022, 2029): - print(dataset.label, year) - loss_results = get_loss_results(dataset, year) - loss_results["time_period"] = year - loss_results["dataset"] = dataset.label - df = pd.concat([df, loss_results]) - df = df.reset_index(drop=True) - return df - - -df = get_validation() -truth_df = df[df.dataset == df.dataset.unique()[0]].reset_index() -truth_df["estimate"] = truth_df["target"] -truth_df["error"] = truth_df["estimate"] - truth_df["target"] -truth_df["abs_error"] = truth_df["error"].abs() -truth_df["rel_error"] = truth_df["error"] / truth_df["target"] -truth_df["abs_rel_error"] = truth_df["rel_error"].abs() -truth_df["dataset"] = "Official" -df = pd.concat([df, truth_df]).reset_index(drop=True) - -st.write( - "Calibration check: the table below shows how both the original and enhanced FRS datasets compare to over 2,000 official statistics (which the EFRS was explicitly calibrated to hit) from the OBR, DWP and HMRC." -) - -st.write( - "Since the EFRS is calibrated to these statistics, high performance is expected and achieved." -) - -a, b = st.columns(2) - -with a: - frs_mean = df[df.dataset == "FRS (2022-23)"].abs_rel_error.mean() - st.metric("FRS average error", f"{frs_mean:.2%}") -with b: - efrs_mean = df[df.dataset == "Enhanced FRS (2022-23)"].abs_rel_error.mean() - st.metric("Enhanced FRS average error", f"{efrs_mean:.2%}") - -selected_metrics = st.selectbox("Select statistic", df.name.unique()) -comparison = st.selectbox( - "Select metric", - ["estimate", "error", "abs_error", "rel_error", "abs_rel_error"], -) - -# Bar chart showing datasets and a dotted line for actual - -import plotly.express as px - -comparison_df = ( - df[df.name == selected_metrics] - .groupby(["dataset", "time_period"])[comparison] - .mean() - .reset_index() -) - -fig = px.bar( - comparison_df, - x="time_period", - y=comparison, - color="dataset", - barmode="group", - title=f"{selected_metrics} {comparison} comparison", -) -st.plotly_chart(fig, use_container_width=True) - - -st.dataframe(df) - -st.dataframe(df[df.name == selected_metrics]) diff --git a/docs/utils.py b/docs/utils.py new file mode 100644 index 0000000..d61f829 --- /dev/null +++ b/docs/utils.py @@ -0,0 +1,7 @@ +import plotly.io as pio +from IPython.display import HTML + + +def show(fig): + html = pio.to_html(fig) + return HTML(html) diff --git a/policyengine_uk_data/utils/github.py b/policyengine_uk_data/utils/github.py index 27c88e1..d949ac8 100644 --- a/policyengine_uk_data/utils/github.py +++ b/policyengine_uk_data/utils/github.py @@ -74,7 +74,7 @@ def upload( f"Asset {file_name} already exists in release {release_tag} of {org}/{repo}, skipping." ) return - + url = f"https://uploads.github.com/repos/{org}/{repo}/releases/{release_id}/assets?name={file_name}" headers = { @@ -85,7 +85,7 @@ def upload( with open(file_path, "rb") as f: data = f.read() - + response = requests.post( url, headers=headers, @@ -96,9 +96,8 @@ def upload( raise ValueError( f"Invalid response code {response.status_code} for url {url}. Received: {response.text}" ) - - return response.json() + return response.json() def set_pr_auto_review_comment(text: str): diff --git a/pyproject.toml b/pyproject.toml index 2ecfff7..cd227f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,10 +25,11 @@ dev = [ "black", "pytest", "policyengine_uk>=1.8.0", - "streamlit", "survey_enhance", "torch", "tables", + "furo", + "jupyter-book", ] [tool.setuptools]