Capital Gains Tax (and imputations) (#814)

* Add initial capital gains progress imputation * Add gains data and documentation * Format * Versioning * Update dockerfile * Format
PolicyEngine · Feb 19, 2024 · b6eefc7 · b6eefc7
1 parent 02b718c
commit b6eefc7
Show file tree

Hide file tree

Showing 18 changed files with 483 additions and 0 deletions.
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,4 @@
+- bump: minor
+  changes:
+    added:
+    - Initial version of capital gains imputations and logic.
diff --git a/docs/streamlit/.streamlit/config.toml b/docs/streamlit/.streamlit/config.toml
@@ -0,0 +1,3 @@
+[theme]
+primaryColor="#2C6496"
+font="serif"
diff --git a/docs/streamlit/Dockerfile b/docs/streamlit/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:3.9
+RUN pip install policyengine-uk streamlit ipython
+WORKDIR /app
+COPY . /app
+EXPOSE 8501
+HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
+ENTRYPOINT ["streamlit", "run", "Home.py", "--server.port=8501", "--server.address=0.0.0.0"]
diff --git a/docs/streamlit/Home.py b/docs/streamlit/Home.py
@@ -0,0 +1,41 @@
+import streamlit as st
+
+STYLE = """
+<style>
+header {
+    display: none !important;
+}
+footer {
+    display: none !important;
+}
+section > div.block-container {
+    padding-top: 0px !important;
+    padding-bottom: 0px !important;
+}
+html, body, [class*="css"] {
+  font-family: "Roboto Serif", serif !important;
+  font-weight: 500;
+}
+[data-baseweb="slider"] {
+    padding-left: 10px !important;
+}
+#MainMenu {
+    visibility: hidden;
+}
+footer {
+    visibility: hidden;
+}
+.modebar{
+      display: none !important;
+}
+</style>
+"""
+st.write(STYLE, unsafe_allow_html=True)
+
+st.title("PolicyEngine UK documentation")
+
+st.markdown(
+    """
+    This is the documentation for PolicyEngine UK, an open-source microsimulation model of the UK tax and benefit system.
+    """
+)
diff --git a/docs/streamlit/pages/Capital_Gains_Tax.py b/docs/streamlit/pages/Capital_Gains_Tax.py
@@ -0,0 +1,197 @@
+import streamlit as st
+import pandas as pd
+from microdf import MicroDataFrame
+import numpy as np
+import plotly.express as px
+from policyengine_core.charts import format_fig, BLUE, BLUE_LIGHT
+from Home import STYLE
+from policyengine_uk.data.storage import STORAGE_FOLDER
+
+# st.set_page_config(layout="wide")
+
+st.write(STYLE, unsafe_allow_html=True)
+st.title("Capital Gains Tax")
+
+st.markdown(
+    """This page documents PolicyEngine's in-progress capital gains imputations in the PolicyEngine UK microsimulation model."""
+)
+
+st.subheader("Method")
+
+st.markdown(
+    """Our input data consists of: PolicyEngine's Enhanced FRS (incorporating WAS, LCFS, SPI and ONS/OBR summary data), and joint capital gains-taxable income data from [CAGE working paper no. 465, *Capital Gains and UK Inequality* (Arun Advani, Andy Summers)](https://warwick.ac.uk/fac/soc/economics/research/centres/cage/manage/publications/wp465.2020.pdf).
+         
+This data includes p05, p10, p25, p50, p75, p90, and p95 percentiles of capital gains (given gains != 0) as well as the percentage with gains for each of over 60 income bands. We fit a spline to each income band's percentiles, and use these splines to impute capital gains for each individual in the microsimulation model as an initial approach.
+
+The below figure is interactive and shows the fitted spline for each income band.
+         """
+)
+
+st.warning(
+    "**Caveat:** so far, we've only used income bands up to over £128,000, so won't capture the very highest earners."
+)
+
+capital_gains = pd.read_csv(
+    STORAGE_FOLDER
+    / "imputations"
+    / "capital_gains_distribution_advani_summers.csv.gz"
+)
+capital_gains["maximum_total_income"] = (
+    capital_gains.minimum_total_income.shift(-1).fillna(np.inf)
+)
+# Fit a spline to each income band's percentiles
+from scipy.interpolate import UnivariateSpline
+
+# One quadratic (k=2) spline per income band, keyed by the band's lower
+# income bound. Each spline maps a quantile in [0, 1] to a capital gains
+# amount, fitted through the seven published percentiles of that band.
+splines = {
+    band.minimum_total_income: UnivariateSpline(
+        [0.05, 0.1, 0.25, 0.5, 0.75, 0.90, 0.95],
+        [
+            band.p05,
+            band.p10,
+            band.p25,
+            band.p50,
+            band.p75,
+            band.p90,
+            band.p95,
+        ],
+        k=2,
+    )
+    for _, band in capital_gains.iterrows()
+}
+
+with st.expander("Capital gains-income joint distribution input data"):
+    st.dataframe(capital_gains)
+
+with st.expander("Capital gains-income joint distribution fitted splines"):
+    income_band = st.select_slider(
+        "Income band",
+        capital_gains.minimum_total_income,
+        format_func=lambda x: f"£{x:,.0f}",
+    )
+
+    fig = (
+        px.line(
+            x=np.linspace(0, 1, 100),
+            y=splines[income_band](np.linspace(0, 1, 100)),
+        )
+        .update_layout(
+            title="Percentiles of capital gains",
+            yaxis_title="Capital gains",
+            xaxis_title="Percentile",
+            yaxis_tickformat=",.0f",
+            yaxis_tickprefix="£",
+            xaxis_tickformat=".0%",
+            yaxis_range=[capital_gains.p05.min(), capital_gains.p95.max()],
+        )
+        .update_traces(line=dict(color=BLUE))
+    )
+
+    st.plotly_chart(format_fig(fig), use_container_width=True)
+
+from tqdm import tqdm
+from policyengine_uk.system import system
+
+cgt_revenue = system.parameters.calibration.programs.capital_gains.total
+
+lower_income_bounds = list(splines)
+uprating_from_2017 = cgt_revenue("2023-01-01") / cgt_revenue("2017-01-01")
+
+
+def impute_capital_gains(total_income: float) -> float:
+    """Draw one random capital gains amount for a given total income.
+
+    The income band is the row of ``capital_gains`` whose
+    ``[minimum_total_income, maximum_total_income)`` interval contains
+    ``total_income``. That row's ``percent_with_gains`` is used as the
+    probability of having any gains. If gains are drawn, a uniform random
+    quantile is passed through the band's fitted spline and the result is
+    scaled by ``uprating_from_2017``.
+
+    Args:
+        total_income: Total income to impute gains for.
+
+    Returns:
+        The imputed capital gains, or 0 when the income is negative, when
+        no income band contains it (e.g. NaN), or when the random draw
+        assigns no gains.
+    """
+    if total_income < 0:
+        return 0
+    distribution_row = capital_gains[
+        (capital_gains["minimum_total_income"] <= total_income)
+        & (capital_gains["maximum_total_income"] > total_income)
+    ]
+    if distribution_row.empty:
+        # No band contains this income (NaN compares False on both sides),
+        # so there is no distribution to sample from.
+        return 0
+    percent_with_gains = distribution_row["percent_with_gains"].values[0]
+    has_gains = np.random.choice(
+        [0, 1], p=[1 - percent_with_gains, percent_with_gains]
+    )
+    if not has_gains:
+        return 0
+    # Pick the spline by the matched band's lower bound so the sampled
+    # gains come from the same band as the probability above. ``splines``
+    # is keyed by ``minimum_total_income``.
+    band_lower_bound = distribution_row["minimum_total_income"].values[0]
+    spline = splines[band_lower_bound]
+    sample_percentile = np.random.random()
+    return float(spline(sample_percentile) * uprating_from_2017)
+
+
+imputed_gains = []
+
+st.markdown(
+    """Then, for every household in the model, we randomly sample their probability of gains according to the capital gains statistics, and sample a random quantile from the relevant income band's fitted spline to determine the amount if they are imputed to have gains. You can run this process on individual income data inputs below."""
+)
+
+with st.expander("Capital gains imputation test runner"):
+
+    income = st.slider("Total income", 0, 500000, 50000, 1000)
+
+    with st.spinner("Imputing capital gains..."):
+        capital_gains = [impute_capital_gains(income) for _ in range(100)]
+
+    fig = (
+        px.histogram(x=capital_gains, nbins=10)
+        .update_layout(
+            title="Imputed capital gains",
+            xaxis_title="Capital gains",
+            yaxis_title="Frequency",
+            xaxis_tickformat=",.0f",
+            xaxis_tickprefix="£",
+            xaxis_range=[0, 1_000_000],
+        )
+        .update_traces(marker=dict(color=BLUE))
+    )
+
+    st.plotly_chart(format_fig(fig), use_container_width=True)
+
+st.subheader("Analysis")
+
+st.markdown(
+    """We can use the imputed capital gains to analyse the distribution of capital gains in the model. The below figure shows the joint distribution of total income and capital gains as a scatter plot."""
+)
+
+st.warning("**Again**- in progress.")
+
+
+@st.cache_resource
+def get_microsimulation():
+    """Create the PolicyEngine UK microsimulation used by this page.
+
+    Decorated with ``st.cache_resource``, so Streamlit reuses the returned
+    object instead of rebuilding it on every script rerun.
+
+    Returns:
+        A ``Microsimulation`` on which ``household_net_income`` has already
+        been calculated once.
+    """
+    from policyengine_uk import Microsimulation
+
+    simulation = Microsimulation()
+    # Result is discarded; presumably this warms the simulation before the
+    # object is cached -- confirm against Microsimulation's caching.
+    simulation.calculate("household_net_income")
+    return simulation
+
+
+sim = get_microsimulation()
+
+col1, col2 = st.columns(2)
+
+with col1:
+    st.metric(
+        "Total capital gains",
+        f"£{sim.calculate('capital_gains').sum()/1e9:.1f}bn",
+    )
+
+with col2:
+    st.metric(
+        "Total CGT revenue",
+        f"£{sim.calculate('capital_gains_tax').sum()/1e9:.1f}bn",
+    )
+
+
+with st.expander("PolicyEngine UK capital gains-income joint distribution"):
+    # Scatter of the simulation's total income against its capital gains.
+    fig = (
+        px.scatter(
+            x=sim.calculate("total_income"),
+            y=sim.calculate("capital_gains"),
+            opacity=0.1,
+        )
+        # px.scatter draws markers only, so the colour has to be set on the
+        # marker; a line colour has no visible effect on these traces.
+        .update_traces(marker=dict(color=BLUE))
+        .update_layout(
+            title="PolicyEngine UK capital gains-income joint distribution",
+            xaxis_title="Total income",
+            yaxis_title="Capital gains",
+            xaxis_tickformat=",.0f",
+            xaxis_tickprefix="£",
+            yaxis_tickformat=",.0f",
+            yaxis_tickprefix="£",
+        )
+    )
+
+    st.plotly_chart(format_fig(fig), use_container_width=True)
diff --git a/policyengine_uk/data/datasets/frs/imputations/capital_gains.py b/policyengine_uk/data/datasets/frs/imputations/capital_gains.py
@@ -0,0 +1,72 @@
+import pandas as pd
+import numpy as np
+
+# Fit a spline to each income band's percentiles
+from scipy.interpolate import UnivariateSpline
+from policyengine_uk import Microsimulation
+from tqdm import tqdm
+from policyengine_uk.system import system
+from policyengine_uk.data.storage import STORAGE_FOLDER
+
+capital_gains = pd.read_csv(
+    STORAGE_FOLDER
+    / "imputations"
+    / "capital_gains_distribution_advani_summers.csv.gz"
+)
+capital_gains["maximum_total_income"] = (
+    capital_gains.minimum_total_income.shift(-1).fillna(np.inf)
+)
+
+
+# One linear (k=1) spline per income band, keyed by the band's lower
+# income bound. Each spline maps a quantile in [0, 1] to a capital gains
+# amount, fitted through the seven published percentiles of that band.
+splines = {
+    band.minimum_total_income: UnivariateSpline(
+        [0.05, 0.1, 0.25, 0.5, 0.75, 0.90, 0.95],
+        [
+            band.p05,
+            band.p10,
+            band.p25,
+            band.p50,
+            band.p75,
+            band.p90,
+            band.p95,
+        ],
+        k=1,
+    )
+    for _, band in capital_gains.iterrows()
+}
+
+
+sim = Microsimulation()
+
+total_income = sim.calculate("total_income", 2023)
+cgt_revenue = system.parameters.calibration.programs.capital_gains.total
+
+lower_income_bounds = list(splines)
+uprating_from_2017 = cgt_revenue("2023-01-01") / cgt_revenue("2017-01-01")
+
+
+def impute_capital_gains(total_income: float, age: float) -> float:
+    """Draw one random capital gains amount for a person.
+
+    The income band is the row of ``capital_gains`` whose
+    ``[minimum_total_income, maximum_total_income)`` interval contains
+    ``total_income``. That row's ``percent_with_gains`` is used as the
+    probability of having any gains. If gains are drawn, a uniform random
+    quantile is passed through the band's fitted spline and the result is
+    scaled by ``uprating_from_2017``.
+
+    Args:
+        total_income: The person's total income.
+        age: The person's age; anyone under 18 is assigned no gains.
+
+    Returns:
+        The imputed capital gains, or 0 when the income is negative, the
+        person is under 18, no income band contains the income (e.g. NaN),
+        or the random draw assigns no gains.
+    """
+    if total_income < 0 or age < 18:
+        return 0
+    distribution_row = capital_gains[
+        (capital_gains["minimum_total_income"] <= total_income)
+        & (capital_gains["maximum_total_income"] > total_income)
+    ]
+    if distribution_row.empty:
+        # No band contains this income (NaN compares False on both sides),
+        # so there is no distribution to sample from.
+        return 0
+    percent_with_gains = distribution_row["percent_with_gains"].values[0]
+    has_gains = np.random.choice(
+        [0, 1], p=[1 - percent_with_gains, percent_with_gains]
+    )
+    if not has_gains:
+        return 0
+    sample_percentile = np.random.random()
+    # Pick the spline by the matched band's lower bound so the sampled
+    # gains come from the same band as the probability above. ``splines``
+    # is keyed by ``minimum_total_income``.
+    band_lower_bound = distribution_row["minimum_total_income"].values[0]
+    spline = splines[band_lower_bound]
+    return float(spline(sample_percentile) * uprating_from_2017)
+
+
+if __name__ == "__main__":
+    # Impute gains for each (income, age) pair from the 2023 simulation and
+    # save them as a single-column compressed CSV in the storage folder.
+    # Assumes total_income and age are aligned record-for-record -- confirm.
+    ages = sim.calculate("age", 2023)
+    imputed_gains = [
+        impute_capital_gains(income, age)
+        for income, age in tqdm(list(zip(total_income, ages)))
+    ]
+
+    output_path = STORAGE_FOLDER / "imputations" / "imputed_gains.csv.gz"
+    pd.DataFrame({"imputed_gains": imputed_gains}).to_csv(
+        output_path, index=False
+    )
diff --git a/policyengine_uk/data/storage/imputations/capital_gains_distribution_advani_summers.csv.gz b/policyengine_uk/data/storage/imputations/capital_gains_distribution_advani_summers.csv.gz
diff --git a/policyengine_uk/data/storage/imputations/imputed_gains.csv.gz b/policyengine_uk/data/storage/imputations/imputed_gains.csv.gz
diff --git a/policyengine_uk/parameters/calibration/programs/capital_gains/tax.yaml b/policyengine_uk/parameters/calibration/programs/capital_gains/tax.yaml
@@ -0,0 +1,39 @@
+description: Capital gains tax revenue.
+values:
+  1999-01-01: 2_122_000_000
+  2000-01-01: 3_236_000_000
+  2001-01-01: 3_034_000_000
+  2002-01-01: 1_596_000_000
+  2003-01-01: 2_225_000_000
+  2004-01-01: 2_282_000_000
+  2005-01-01: 3_042_000_000
+  2006-01-01: 3_830_000_000
+  2007-01-01: 5_268_000_000
+  2008-01-01: 7_852_000_000
+  2009-01-01: 2_491_000_000
+  2010-01-01: 3_601_000_000
+  2011-01-01: 4_337_000_000
+  2012-01-01: 3_927_000_000
+  2013-01-01: 3_908_000_000
+  2014-01-01: 5_559_000_000
+  2015-01-01: 7_060_000_000
+  2016-01-01: 8_561_000_000
+  2017-01-01: 7_793_000_000
+  2018-01-01: 9_191_000_000
+  2019-01-01: 9_827_000_000
+  2020-01-01: 11_131_000_000
+  2021-01-01: 15_267_000_000
+  2022-01-01: 18_077_057_790
+  # OBR Forecast
+  2023-01-01: 17_759_351_662
+  2024-01-01: 19_512_453_309
+  2025-01-01: 21_164_830_357
+  2026-01-01: 23_383_475_972
+  2027-01-01: 26_144_242_482
+
+metadata:
+  unit: currency-GBP
+  label: Capital Gains Tax revenue
+  reference:
+    - title: Capital Gains Tax | OBR
+      href: https://obr.uk/forecasts-in-depth/tax-by-tax-spend-by-spend/capital-gains-tax