Skip to content

Commit

Permalink
Capital Gains Tax (and imputations) (#814)
Browse files Browse the repository at this point in the history
* Add initial capital gains progress imputation

* Add gains data and documentation

* Format

* Versioning

* Update dockerfile

* Format
  • Loading branch information
nikhilwoodruff authored Feb 19, 2024
1 parent 02b718c commit b6eefc7
Show file tree
Hide file tree
Showing 18 changed files with 483 additions and 0 deletions.
4 changes: 4 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- bump: minor
changes:
added:
- Initial version of capital gains imputations and logic.
3 changes: 3 additions & 0 deletions docs/streamlit/.streamlit/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[theme]
primaryColor="#2C6496"
font="serif"
7 changes: 7 additions & 0 deletions docs/streamlit/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Container image that serves the Streamlit documentation app in this folder.
FROM python:3.9
# Install the packages named on this line from PyPI.
RUN pip install policyengine-uk streamlit ipython
WORKDIR /app
# Copy the build context into the working directory.
COPY . /app
# Same port that is passed to Streamlit via --server.port below.
EXPOSE 8501
# Health probe: curl exits non-zero (--fail) if the health URL returns an HTTP error.
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
# Start the app from Home.py, bound to all interfaces on port 8501.
ENTRYPOINT ["streamlit", "run", "Home.py", "--server.port=8501", "--server.address=0.0.0.0"]
41 changes: 41 additions & 0 deletions docs/streamlit/Home.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import streamlit as st

# Shared CSS for the documentation pages. It is written to the page with
# `unsafe_allow_html=True` and is also imported by pages/Capital_Gains_Tax.py.
# The rules hide the header, footer, main menu and Plotly mode bar, remove the
# block container's vertical padding, and set the page font.
#
# The font-family declaration previously read `"Roboto Serif", !important`.
# The dangling comma makes the declaration invalid CSS, so it was dropped by
# the browser; a generic `serif` fallback completes the font list.
STYLE = """
<style>
header {
display: none !important;
}
footer {
display: none !important;
}
section > div.block-container {
padding-top: 0px !important;
padding-bottom: 0px !important;
}
html, body, [class*="css"] {
font-family: "Roboto Serif", serif !important;
font-weight: 500;
}
[data-baseweb="slider"] {
padding-left: 10px !important;
}
#MainMenu {
visibility: hidden;
}
footer {
visibility: hidden;
}
.modebar{
display: none !important;
}
</style>
"""
# Emit the shared CSS; unsafe_allow_html lets the raw <style> element through.
st.write(STYLE, unsafe_allow_html=True)

# Page heading.
st.title("PolicyEngine UK documentation")

# One-paragraph description of the project shown under the heading.
st.markdown(
"""
This is the documentation for PolicyEngine UK, an open-source microsimulation model of the UK tax and benefit system.
"""
)
197 changes: 197 additions & 0 deletions docs/streamlit/pages/Capital_Gains_Tax.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
import streamlit as st
import pandas as pd
from microdf import MicroDataFrame
import numpy as np
import plotly.express as px
from policyengine_core.charts import format_fig, BLUE, BLUE_LIGHT
from Home import STYLE
from policyengine_uk.data.storage import STORAGE_FOLDER

# Wide-layout page configuration, currently disabled.
# st.set_page_config(layout="wide")

# Apply the CSS shared with the Home page, then render the page heading.
st.write(STYLE, unsafe_allow_html=True)
st.title("Capital Gains Tax")

# Short statement of what this page documents.
st.markdown(
"""This page documents PolicyEngine's in-progress capital gains imputations in the PolicyEngine UK microsimulation model."""
)

st.subheader("Method")

# Description of the input data and the per-band spline fit that the code
# further down this script performs.
st.markdown(
"""Our input data consists of: PolicyEngine's Enhanced FRS (incorporating WAS, LCFS, SPI and ONS/OBR summary data), and joint capital gains-taxable income data from [CAGE working paper no. 465, *Capital Gains and UK Inequality* (Arun Advani, Andy Summers)](https://warwick.ac.uk/fac/soc/economics/research/centres/cage/manage/publications/wp465.2020.pdf).
This data includes p05, p10, p25, p50, p75, p90, and p95 percentiles of capital gains (given gains != 0) as well as the percentage with gains for each of over 60 income bands. We fit a spline to each income band's percentiles, and use these splines to impute capital gains for each individual in the microsimulation model as an initial approach.
The below figure is interactive and shows the fitted spline for each income band.
"""
)

# Known limitation of the current input data, surfaced to the reader.
st.warning(
"**Caveat:** so far, we've only used income bands up to over £128,000, so won't capture the very highest earners."
)

# Load the capital gains distribution table and give every row an upper income
# bound: the next row's `minimum_total_income`, or +inf for the final row.
# (This relies on the rows being ordered by `minimum_total_income`.)
capital_gains = pd.read_csv(
    STORAGE_FOLDER
    / "imputations"
    / "capital_gains_distribution_advani_summers.csv.gz"
).assign(
    maximum_total_income=lambda frame: frame["minimum_total_income"]
    .shift(-1)
    .fillna(np.inf)
)
# Fit a spline to each income band's percentiles
from scipy.interpolate import UnivariateSpline

# Fit one degree-2 spline per row of the distribution table, mapping a
# percentile level in [0.05, 0.95] to the capital gains amount at that level.
# The result is keyed by the row's `minimum_total_income`.
# NOTE(review): policyengine_uk/data/datasets/frs/imputations/capital_gains.py
# fits the same data with k=1 - confirm which degree is intended.
quantile_levels = [0.05, 0.1, 0.25, 0.5, 0.75, 0.90, 0.95]
quantile_columns = ["p05", "p10", "p25", "p50", "p75", "p90", "p95"]

splines = {}

for _, row in capital_gains.iterrows():
    band_quantiles = [row[column] for column in quantile_columns]
    splines[row["minimum_total_income"]] = UnivariateSpline(
        quantile_levels, band_quantiles, k=2
    )

# Raw input table, collapsed by default.
with st.expander("Capital gains-income joint distribution input data"):
    st.dataframe(capital_gains)

# Interactive view of the fitted spline for one selectable income band.
with st.expander("Capital gains-income joint distribution fitted splines"):
    income_band = st.select_slider(
        "Income band",
        capital_gains.minimum_total_income,
        format_func=lambda lower_bound: f"£{lower_bound:,.0f}",
    )

    # Evaluate the selected band's spline on an even grid over [0, 1].
    percentile_grid = np.linspace(0, 1, 100)
    fig = px.line(x=percentile_grid, y=splines[income_band](percentile_grid))
    # Fix the y-axis to the table-wide p05..p95 range so bands are comparable
    # as the slider moves.
    fig.update_layout(
        title="Percentiles of capital gains",
        yaxis_title="Capital gains",
        xaxis_title="Percentile",
        yaxis_tickformat=",.0f",
        yaxis_tickprefix="£",
        xaxis_tickformat=".0%",
        yaxis_range=[capital_gains.p05.min(), capital_gains.p95.max()],
    )
    fig.update_traces(line=dict(color=BLUE))

    st.plotly_chart(format_fig(fig), use_container_width=True)

from tqdm import tqdm
from policyengine_uk.system import system

# Lower income bound of every fitted band (the keys of `splines`, in insertion order).
lower_income_bounds = [*splines]

# Capital gains tax revenue calibration parameter, callable with an instant string.
cgt_revenue = system.parameters.calibration.programs.capital_gains.total

# Ratio of the 2023 to the 2017 revenue value, applied to sampled gains.
# NOTE(review): presumably the distribution table is in 2017 terms - confirm.
uprating_from_2017 = cgt_revenue("2023-01-01") / cgt_revenue("2017-01-01")


def impute_capital_gains(total_income: float) -> float:
    """Randomly draw one capital gains amount for a given total income.

    The income is matched to the row of ``capital_gains`` whose
    ``[minimum_total_income, maximum_total_income)`` interval contains it.
    With probability ``percent_with_gains`` from that row, a uniform random
    percentile is passed through that row's fitted spline and the result is
    multiplied by ``uprating_from_2017``; otherwise the draw is zero.

    Args:
        total_income: Total income to impute gains for.

    Returns:
        The sampled capital gains amount, or 0.0 when the income is negative
        or the draw has no gains.

    Raises:
        IndexError: If no row of ``capital_gains`` contains ``total_income``
            (for example a NaN income).
    """
    if total_income < 0:
        return 0.0
    distribution_row = capital_gains[
        (capital_gains["minimum_total_income"] <= total_income)
        & (capital_gains["maximum_total_income"] > total_income)
    ]
    percent_with_gains = distribution_row["percent_with_gains"].values[0]
    has_gains = np.random.choice(
        [0, 1], p=[1 - percent_with_gains, percent_with_gains]
    )
    if not has_gains:
        return 0.0
    # Select the spline from the row matched above. The previous scan over the
    # band bounds used `continue` and never stopped at the matching band, so
    # the spline it ended on did not depend on which band the income fell in.
    band_lower_bound = distribution_row["minimum_total_income"].values[0]
    sample_percentile = np.random.random()
    sampled_gains = splines[band_lower_bound](sample_percentile)
    # The spline returns a NumPy value; convert it to match the annotation.
    return float(sampled_gains * uprating_from_2017)


st.markdown(
    """Then, for every household in the model, we randomly sample their probability of gains according to the capital gains statistics, and sample a random quantile from the relevant income band's fitted spline to determine the amount if they are imputed to have gains. You can run this process on individual income data inputs below."""
)

# Interactive runner: draw 100 imputations for a chosen income and plot them.
with st.expander("Capital gains imputation test runner"):
    income = st.slider("Total income", 0, 500000, 50000, 1000)

    with st.spinner("Imputing capital gains..."):
        # Store the draws under their own name. Assigning them to
        # `capital_gains` replaced the module-level DataFrame that
        # impute_capital_gains reads, so any later call in the same script
        # run would have failed.
        imputed_gains = [impute_capital_gains(income) for _ in range(100)]

    fig = (
        px.histogram(x=imputed_gains, nbins=10)
        .update_layout(
            title="Imputed capital gains",
            xaxis_title="Capital gains",
            yaxis_title="Frequency",
            xaxis_tickformat=",.0f",
            xaxis_tickprefix="£",
            xaxis_range=[0, 1_000_000],
        )
        .update_traces(marker=dict(color=BLUE))
    )

    st.plotly_chart(format_fig(fig), use_container_width=True)

# Start of the section that reports results from the full microsimulation.
st.subheader("Analysis")

st.markdown(
"""We can use the imputed capital gains to analyse the distribution of capital gains in the model. The below figure shows the joint distribution of total income and capital gains as a scatter plot."""
)

# Repeat the in-progress caveat for this section.
st.warning("**Again**- in progress.")


@st.cache_resource
def get_microsimulation():
    """Build and return the PolicyEngine UK microsimulation.

    Decorated with ``st.cache_resource``, so Streamlit reuses the returned
    object instead of rebuilding it on every script rerun.

    Returns:
        A ``Microsimulation`` on which ``household_net_income`` has already
        been calculated once.
    """
    # Imported here rather than at module level, as in the original.
    from policyengine_uk import Microsimulation

    simulation = Microsimulation()
    # Result is discarded; presumably this warms the simulation's caches
    # before the object is stored - TODO confirm.
    simulation.calculate("household_net_income")
    return simulation


sim = get_microsimulation()

# Two side-by-side headline figures, both shown in billions of pounds.
col1, col2 = st.columns(2)

with col1:
    total_gains_bn = sim.calculate("capital_gains").sum() / 1e9
    st.metric("Total capital gains", f"£{total_gains_bn:.1f}bn")

with col2:
    total_cgt_bn = sim.calculate("capital_gains_tax").sum() / 1e9
    st.metric("Total CGT revenue", f"£{total_cgt_bn:.1f}bn")


# Scatter of modelled total income against modelled capital gains.
with st.expander("PolicyEngine UK capital gains-income joint distribution"):
    fig = (
        px.scatter(
            x=sim.calculate("total_income"),
            y=sim.calculate("capital_gains"),
            # Low opacity so dense regions of overlapping points stay legible.
            opacity=0.1,
        )
        # px.scatter draws markers only, so the colour must be set on
        # `marker`; the previous `line=dict(color=BLUE)` had no visible effect.
        .update_traces(marker=dict(color=BLUE))
        .update_layout(
            title="PolicyEngine UK capital gains-income joint distribution",
            xaxis_title="Total income",
            yaxis_title="Capital gains",
            xaxis_tickformat=",.0f",
            xaxis_tickprefix="£",
            yaxis_tickformat=",.0f",
            yaxis_tickprefix="£",
        )
    )

    st.plotly_chart(format_fig(fig), use_container_width=True)
72 changes: 72 additions & 0 deletions policyengine_uk/data/datasets/frs/imputations/capital_gains.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import pandas as pd
import numpy as np

# Fit a spline to each income band's percentiles
from scipy.interpolate import UnivariateSpline
from policyengine_uk import Microsimulation
from tqdm import tqdm
from policyengine_uk.system import system
from policyengine_uk.data.storage import STORAGE_FOLDER

# Load the capital gains distribution table and give every row an upper income
# bound: the next row's `minimum_total_income`, or +inf for the final row.
# (This relies on the rows being ordered by `minimum_total_income`.)
capital_gains = pd.read_csv(
    STORAGE_FOLDER
    / "imputations"
    / "capital_gains_distribution_advani_summers.csv.gz"
).assign(
    maximum_total_income=lambda frame: frame["minimum_total_income"]
    .shift(-1)
    .fillna(np.inf)
)


# Fit one degree-1 spline per row of the distribution table, mapping a
# percentile level in [0.05, 0.95] to the capital gains amount at that level.
# The result is keyed by the row's `minimum_total_income`.
quantile_levels = [0.05, 0.1, 0.25, 0.5, 0.75, 0.90, 0.95]
quantile_columns = ["p05", "p10", "p25", "p50", "p75", "p90", "p95"]

splines = {}

for _, row in capital_gains.iterrows():
    band_quantiles = [row[column] for column in quantile_columns]
    splines[row["minimum_total_income"]] = UnivariateSpline(
        quantile_levels, band_quantiles, k=1
    )


# Capital gains tax revenue calibration parameter, callable with an instant string.
cgt_revenue = system.parameters.calibration.programs.capital_gains.total
# Ratio of the 2023 to the 2017 revenue value, applied to sampled gains.
# NOTE(review): presumably the distribution table is in 2017 terms - confirm.
uprating_from_2017 = cgt_revenue("2023-01-01") / cgt_revenue("2017-01-01")

# Lower income bound of every fitted band (the keys of `splines`, in insertion order).
lower_income_bounds = list(splines.keys())

# Built when this module is imported, not only when it is run as a script.
sim = Microsimulation()
total_income = sim.calculate("total_income", 2023)


def impute_capital_gains(total_income: float, age: float) -> float:
    """Randomly draw one capital gains amount for a person.

    The income is matched to the row of ``capital_gains`` whose
    ``[minimum_total_income, maximum_total_income)`` interval contains it.
    With probability ``percent_with_gains`` from that row, a uniform random
    percentile is passed through that row's fitted spline and the result is
    multiplied by ``uprating_from_2017``; otherwise the draw is zero.

    Args:
        total_income: The person's total income.
        age: The person's age; anyone under 18 is assigned no gains.

    Returns:
        The sampled capital gains amount, or 0.0 when the income is negative,
        the person is under 18, or the draw has no gains.

    Raises:
        IndexError: If no row of ``capital_gains`` contains ``total_income``
            (for example a NaN income).
    """
    if total_income < 0 or age < 18:
        return 0.0
    distribution_row = capital_gains[
        (capital_gains["minimum_total_income"] <= total_income)
        & (capital_gains["maximum_total_income"] > total_income)
    ]
    percent_with_gains = distribution_row["percent_with_gains"].values[0]
    has_gains = np.random.choice(
        [0, 1], p=[1 - percent_with_gains, percent_with_gains]
    )
    if not has_gains:
        return 0.0
    sample_percentile = np.random.random()
    # Select the spline from the row matched above. The previous scan over the
    # band bounds used `continue` and never stopped at the matching band, so
    # the spline it ended on did not depend on which band the income fell in.
    band_lower_bound = distribution_row["minimum_total_income"].values[0]
    sampled_gains = splines[band_lower_bound](sample_percentile)
    # The spline returns a NumPy value; convert it to match the annotation.
    return float(sampled_gains * uprating_from_2017)


if __name__ == "__main__":
    # Pair each 2023 total income with the matching 2023 age; the pairs are
    # materialised as a list so tqdm knows the total for its progress bar.
    income_age_pairs = list(zip(total_income, sim.calculate("age", 2023)))
    imputed_gains = [
        impute_capital_gains(income, age)
        for income, age in tqdm(income_age_pairs)
    ]

    # Write the draws as a single-column compressed CSV in the imputations folder.
    output_path = STORAGE_FOLDER / "imputations" / "imputed_gains.csv.gz"
    pd.DataFrame({"imputed_gains": imputed_gains}).to_csv(
        output_path, index=False
    )
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
description: Capital gains tax revenue.
values:
1999-01-01: 2_122_000_000
2000-01-01: 3_236_000_000
2001-01-01: 3_034_000_000
2002-01-01: 1_596_000_000
2003-01-01: 2_225_000_000
2004-01-01: 2_282_000_000
2005-01-01: 3_042_000_000
2006-01-01: 3_830_000_000
2007-01-01: 5_268_000_000
2008-01-01: 7_852_000_000
2009-01-01: 2_491_000_000
2010-01-01: 3_601_000_000
2011-01-01: 4_337_000_000
2012-01-01: 3_927_000_000
2013-01-01: 3_908_000_000
2014-01-01: 5_559_000_000
2015-01-01: 7_060_000_000
2016-01-01: 8_561_000_000
2017-01-01: 7_793_000_000
2018-01-01: 9_191_000_000
2019-01-01: 9_827_000_000
2020-01-01: 11_131_000_000
2021-01-01: 15_267_000_000
2022-01-01: 18_077_057_790
# OBR Forecast
2023-01-01: 17_759_351_662
2024-01-01: 19_512_453_309
2025-01-01: 21_164_830_357
2026-01-01: 23_383_475_972
2027-01-01: 26_144_242_482

metadata:
unit: currency-GBP
label: Capital Gains Tax revenue
reference:
- title: Capital Gains Tax | OBR
href: https://obr.uk/forecasts-in-depth/tax-by-tax-spend-by-spend/capital-gains-tax
Loading

0 comments on commit b6eefc7

Please sign in to comment.