From 7d0ee839c4185d98f2471e6884d3957b7675f5bf Mon Sep 17 00:00:00 2001 From: Pablo Arriagada <63430031+paarriagadap@users.noreply.github.com> Date: Wed, 26 Jun 2024 16:18:55 +0100 Subject: [PATCH] Update France data for Chartbook (#2492) --- dag/poverty_inequality.yml | 49 +++++ .../chartbook/2024-04-22/concialdi.meta.yml | 29 +++ .../garden/chartbook/2024-04-22/concialdi.py | 30 +++ .../2024-05-23/wealth_france.meta.yml | 57 +++++ .../chartbook/2024-05-23/wealth_france.py | 34 +++ .../inequality_france_1999.meta.yml | 33 +++ .../2024-03-21/inequality_france_1999.py | 32 +++ .../2024-04-05/inequality_france.meta.yml | 35 +++ .../insee/2024-04-05/inequality_france.py | 62 ++++++ .../2024-04-25/insee_premiere_1875.meta.yml | 174 +++++++++++++++ .../insee/2024-04-25/insee_premiere_1875.py | 54 +++++ .../relative_poverty_france.meta.yml | 49 +++++ .../2024-04-26/relative_poverty_france.py | 54 +++++ .../2024-05-23/interdecile_ratio.meta.yml | 34 +++ .../insee/2024-05-23/interdecile_ratio.py | 38 ++++ .../interdecile_ratio_2022.meta.yml | 34 +++ .../2024-05-23/interdecile_ratio_2022.py | 38 ++++ ..._income_inequality_database.countries.json | 203 ++++++++++++++++++ .../world_income_inequality_database.meta.yml | 31 +++ .../world_income_inequality_database.py | 55 +++++ .../meadow/chartbook/2024-04-22/concialdi.py | 29 +++ .../chartbook/2024-05-23/wealth_france.py | 35 +++ .../2024-03-21/inequality_france_1999.py | 32 +++ .../insee/2024-04-05/inequality_france.py | 40 ++++ .../insee/2024-04-25/insee_premiere_1875.py | 124 +++++++++++ .../2024-04-26/relative_poverty_france.py | 77 +++++++ .../insee/2024-05-23/interdecile_ratio.py | 47 ++++ .../2024-05-23/interdecile_ratio_2022.py | 50 +++++ .../world_income_inequality_database.py | 43 ++++ .../chartbook/2024-04-22/concialdi.csv.dvc | 28 +++ snapshots/chartbook/2024-04-22/concialdi.py | 33 +++ .../2024-05-23/wealth_france.csv.dvc | 28 +++ .../chartbook/2024-05-23/wealth_france.py | 29 +++ 
.../2024-03-21/inequality_france_1999.csv.dvc | 33 +++ .../2024-03-21/inequality_france_1999.py | 29 +++ .../insee/2024-04-05/inequality_france.py | 24 +++ .../2024-04-05/inequality_france.xlsx.dvc | 30 +++ .../insee/2024-04-25/insee_premiere_1875.py | 24 +++ .../2024-04-25/insee_premiere_1875.xlsx.dvc | 30 +++ .../2024-04-26/relative_poverty_france.py | 24 +++ .../relative_poverty_france.xlsx.dvc | 30 +++ .../2024-05-23/interdecile_ratio.csv.dvc | 29 +++ .../insee/2024-05-23/interdecile_ratio.py | 34 +++ .../2024-05-23/interdecile_ratio_2022.csv.dvc | 29 +++ .../2024-05-23/interdecile_ratio_2022.py | 30 +++ .../world_income_inequality_database.py | 24 +++ .../world_income_inequality_database.xlsx.dvc | 29 +++ 47 files changed, 2119 insertions(+) create mode 100644 etl/steps/data/garden/chartbook/2024-04-22/concialdi.meta.yml create mode 100644 etl/steps/data/garden/chartbook/2024-04-22/concialdi.py create mode 100644 etl/steps/data/garden/chartbook/2024-05-23/wealth_france.meta.yml create mode 100644 etl/steps/data/garden/chartbook/2024-05-23/wealth_france.py create mode 100644 etl/steps/data/garden/insee/2024-03-21/inequality_france_1999.meta.yml create mode 100644 etl/steps/data/garden/insee/2024-03-21/inequality_france_1999.py create mode 100644 etl/steps/data/garden/insee/2024-04-05/inequality_france.meta.yml create mode 100644 etl/steps/data/garden/insee/2024-04-05/inequality_france.py create mode 100644 etl/steps/data/garden/insee/2024-04-25/insee_premiere_1875.meta.yml create mode 100644 etl/steps/data/garden/insee/2024-04-25/insee_premiere_1875.py create mode 100644 etl/steps/data/garden/insee/2024-04-26/relative_poverty_france.meta.yml create mode 100644 etl/steps/data/garden/insee/2024-04-26/relative_poverty_france.py create mode 100644 etl/steps/data/garden/insee/2024-05-23/interdecile_ratio.meta.yml create mode 100644 etl/steps/data/garden/insee/2024-05-23/interdecile_ratio.py create mode 100644 
etl/steps/data/garden/insee/2024-05-23/interdecile_ratio_2022.meta.yml create mode 100644 etl/steps/data/garden/insee/2024-05-23/interdecile_ratio_2022.py create mode 100644 etl/steps/data/garden/unu_wider/2024-04-22/world_income_inequality_database.countries.json create mode 100644 etl/steps/data/garden/unu_wider/2024-04-22/world_income_inequality_database.meta.yml create mode 100644 etl/steps/data/garden/unu_wider/2024-04-22/world_income_inequality_database.py create mode 100644 etl/steps/data/meadow/chartbook/2024-04-22/concialdi.py create mode 100644 etl/steps/data/meadow/chartbook/2024-05-23/wealth_france.py create mode 100644 etl/steps/data/meadow/insee/2024-03-21/inequality_france_1999.py create mode 100644 etl/steps/data/meadow/insee/2024-04-05/inequality_france.py create mode 100644 etl/steps/data/meadow/insee/2024-04-25/insee_premiere_1875.py create mode 100644 etl/steps/data/meadow/insee/2024-04-26/relative_poverty_france.py create mode 100644 etl/steps/data/meadow/insee/2024-05-23/interdecile_ratio.py create mode 100644 etl/steps/data/meadow/insee/2024-05-23/interdecile_ratio_2022.py create mode 100644 etl/steps/data/meadow/unu_wider/2024-04-22/world_income_inequality_database.py create mode 100644 snapshots/chartbook/2024-04-22/concialdi.csv.dvc create mode 100644 snapshots/chartbook/2024-04-22/concialdi.py create mode 100644 snapshots/chartbook/2024-05-23/wealth_france.csv.dvc create mode 100644 snapshots/chartbook/2024-05-23/wealth_france.py create mode 100644 snapshots/insee/2024-03-21/inequality_france_1999.csv.dvc create mode 100644 snapshots/insee/2024-03-21/inequality_france_1999.py create mode 100644 snapshots/insee/2024-04-05/inequality_france.py create mode 100644 snapshots/insee/2024-04-05/inequality_france.xlsx.dvc create mode 100644 snapshots/insee/2024-04-25/insee_premiere_1875.py create mode 100644 snapshots/insee/2024-04-25/insee_premiere_1875.xlsx.dvc create mode 100644 snapshots/insee/2024-04-26/relative_poverty_france.py create mode 
100644 snapshots/insee/2024-04-26/relative_poverty_france.xlsx.dvc create mode 100644 snapshots/insee/2024-05-23/interdecile_ratio.csv.dvc create mode 100644 snapshots/insee/2024-05-23/interdecile_ratio.py create mode 100644 snapshots/insee/2024-05-23/interdecile_ratio_2022.csv.dvc create mode 100644 snapshots/insee/2024-05-23/interdecile_ratio_2022.py create mode 100644 snapshots/unu_wider/2024-04-22/world_income_inequality_database.py create mode 100644 snapshots/unu_wider/2024-04-22/world_income_inequality_database.xlsx.dvc diff --git a/dag/poverty_inequality.yml b/dag/poverty_inequality.yml index 554926aaa13..e5351cf7760 100644 --- a/dag/poverty_inequality.yml +++ b/dag/poverty_inequality.yml @@ -118,3 +118,52 @@ steps: data://grapher/oecd/2024-04-30/affordable_housing_database: - data://garden/oecd/2024-04-30/affordable_housing_database + # Inequality data from France (INSEE) + # 1999 report + data://meadow/insee/2024-03-21/inequality_france_1999: + - snapshot://insee/2024-03-21/inequality_france_1999.csv + data://garden/insee/2024-03-21/inequality_france_1999: + - data://meadow/insee/2024-03-21/inequality_france_1999 + # 2021 report + data://meadow/insee/2024-04-25/insee_premiere_1875: + - snapshot://insee/2024-04-25/insee_premiere_1875.xlsx + data://garden/insee/2024-04-25/insee_premiere_1875: + - data://meadow/insee/2024-04-25/insee_premiere_1875 + # Key figures (inequality) - live version + data://meadow/insee/2024-04-05/inequality_france: + - snapshot://insee/2024-04-05/inequality_france.xlsx + data://garden/insee/2024-04-05/inequality_france: + - data://meadow/insee/2024-04-05/inequality_france + # Key figures (relative poverty) - live version + data://meadow/insee/2024-04-26/relative_poverty_france: + - snapshot://insee/2024-04-26/relative_poverty_france.xlsx + data://garden/insee/2024-04-26/relative_poverty_france: + - data://meadow/insee/2024-04-26/relative_poverty_france + # Interdecile ratio for net salary (version 2022) + 
data://meadow/insee/2024-05-23/interdecile_ratio_2022: + - snapshot://insee/2024-05-23/interdecile_ratio_2022.csv + data://garden/insee/2024-05-23/interdecile_ratio_2022: + - data://meadow/insee/2024-05-23/interdecile_ratio_2022 + # Interdecile ratio for net salary (live version) + data://meadow/insee/2024-05-23/interdecile_ratio: + - snapshot://insee/2024-05-23/interdecile_ratio.csv + data://garden/insee/2024-05-23/interdecile_ratio: + - data://meadow/insee/2024-05-23/interdecile_ratio + + # France historical Ginis (Concialdi, 1997) + data://meadow/chartbook/2024-04-22/concialdi: + - snapshot://chartbook/2024-04-22/concialdi.csv + data://garden/chartbook/2024-04-22/concialdi: + - data://meadow/chartbook/2024-04-22/concialdi + + # UNU-WIDER World Income Inequality Database (WIID) + data://meadow/unu_wider/2024-04-22/world_income_inequality_database: + - snapshot://unu_wider/2024-04-22/world_income_inequality_database.xlsx + data://garden/unu_wider/2024-04-22/world_income_inequality_database: + - data://meadow/unu_wider/2024-04-22/world_income_inequality_database + + # Wealth inequality in France (Piketty et al. 2006) + data://meadow/chartbook/2024-05-23/wealth_france: + - snapshot://chartbook/2024-05-23/wealth_france.csv + data://garden/chartbook/2024-05-23/wealth_france: + - data://meadow/chartbook/2024-05-23/wealth_france diff --git a/etl/steps/data/garden/chartbook/2024-04-22/concialdi.meta.yml b/etl/steps/data/garden/chartbook/2024-04-22/concialdi.meta.yml new file mode 100644 index 00000000000..e6aa4141f17 --- /dev/null +++ b/etl/steps/data/garden/chartbook/2024-04-22/concialdi.meta.yml @@ -0,0 +1,29 @@ +# NOTE: To learn more about the fields, hover over their names. 
+definitions: + common: + processing_level: minor + presentation: + topic_tags: + - Economic Inequality + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + concialdi: + variables: + gini: + title: Gini index + unit: "" + short_unit: "" + description_short: The [Gini index](#dod:gini) measures inequality on a scale from 0 to 100. Higher values indicate higher inequality. Inequality is measured here in terms of income before taxes and after benefits. + presentation: + title_public: Gini index + display: + name: Gini index + numDecimalPlaces: 2 + tolerance: 5 diff --git a/etl/steps/data/garden/chartbook/2024-04-22/concialdi.py b/etl/steps/data/garden/chartbook/2024-04-22/concialdi.py new file mode 100644 index 00000000000..3ef2a52f05e --- /dev/null +++ b/etl/steps/data/garden/chartbook/2024-04-22/concialdi.py @@ -0,0 +1,30 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("concialdi") + + # Read table from meadow dataset. + tb = ds_meadow["concialdi"].reset_index() + + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. 
+ ds_garden.save() diff --git a/etl/steps/data/garden/chartbook/2024-05-23/wealth_france.meta.yml b/etl/steps/data/garden/chartbook/2024-05-23/wealth_france.meta.yml new file mode 100644 index 00000000000..c458730a754 --- /dev/null +++ b/etl/steps/data/garden/chartbook/2024-05-23/wealth_france.meta.yml @@ -0,0 +1,57 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + processing_level: minor + description_key: + - Data is estimated from wealth concentration at death, based on estate tax returns. + - Data before 1902 has been estimated by sampling decedents in the Paris region. + presentation: + topic_tags: + - Economic Inequality + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + wealth_france: + variables: + p90p100_share: + title: Richest decile - Wealth share + unit: "%" + short_unit: "%" + description_short: The share of wealth owned by the richest decile (tenth of the population). + presentation: + title_public: Wealth share of the richest 10% + display: + name: Wealth share of the richest 10% + numDecimalPlaces: 1 + tolerance: 5 + + p99p100_share: + title: Top 1% - Wealth share + unit: "%" + short_unit: "%" + description_short: The share of wealth owned by the richest 1%. + presentation: + title_public: Wealth share of the richest 1% + display: + name: Wealth share of the richest 1% + numDecimalPlaces: 1 + tolerance: 5 + + p99_9p100_share: + title: Top 0.1% - Wealth share + unit: "%" + short_unit: "%" + description_short: The share of wealth owned by the richest 0.1%. 
+ presentation: + title_public: Wealth share of the richest 0.1% + display: + name: Wealth share of the richest 0.1% + numDecimalPlaces: 1 + tolerance: 5 + diff --git a/etl/steps/data/garden/chartbook/2024-05-23/wealth_france.py b/etl/steps/data/garden/chartbook/2024-05-23/wealth_france.py new file mode 100644 index 00000000000..f64cf3865ed --- /dev/null +++ b/etl/steps/data/garden/chartbook/2024-05-23/wealth_france.py @@ -0,0 +1,34 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("wealth_france") + + # Read table from meadow dataset. + tb = ds_meadow["wealth_france"].reset_index() + + # + # Process data. + # + + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/garden/insee/2024-03-21/inequality_france_1999.meta.yml b/etl/steps/data/garden/insee/2024-03-21/inequality_france_1999.meta.yml new file mode 100644 index 00000000000..3dd6a33ef4c --- /dev/null +++ b/etl/steps/data/garden/insee/2024-03-21/inequality_france_1999.meta.yml @@ -0,0 +1,33 @@ +# NOTE: To learn more about the fields, hover over their names. 
+definitions: + common: + presentation: + topic_tags: + - Economic Inequality + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + inequality_france_1999: + variables: + gini: + title: Gini coefficient + unit: "" + short_unit: "" + description_short: The [Gini coefficient](#dod:gini) measures inequality on a scale from 0 to 1. Higher values indicate higher inequality. + description_key: + - Income is ‘post-tax’ — measured after taxes have been paid and most government benefits have been received. + - Income has been equivalized – adjusted to account for the fact that people in the same household can share costs like rent and heating. + processing_level: minor + presentation: + title_public: Gini coefficient + display: + name: Gini coefficient + numDecimalPlaces: 2 + tolerance: 5 + diff --git a/etl/steps/data/garden/insee/2024-03-21/inequality_france_1999.py b/etl/steps/data/garden/insee/2024-03-21/inequality_france_1999.py new file mode 100644 index 00000000000..83725bb13b4 --- /dev/null +++ b/etl/steps/data/garden/insee/2024-03-21/inequality_france_1999.py @@ -0,0 +1,32 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("inequality_france_1999") + + # Read table from meadow dataset. + tb = ds_meadow["inequality_france_1999"].reset_index() + + # + # Process data. + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. 
+ ds_garden.save() diff --git a/etl/steps/data/garden/insee/2024-04-05/inequality_france.meta.yml b/etl/steps/data/garden/insee/2024-04-05/inequality_france.meta.yml new file mode 100644 index 00000000000..7a5e97725b3 --- /dev/null +++ b/etl/steps/data/garden/insee/2024-04-05/inequality_france.meta.yml @@ -0,0 +1,35 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + processing_level: major + description_processing: |- + Each indicator is subdivided in several survey spells that are comparable over time. Different methodologies make spells not directly comparable between each other. + presentation: + topic_tags: + - Economic Inequality + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + inequality_france: + variables: + gini: + title: Gini coefficient - Spell <> + unit: "" + short_unit: "" + description_short: The [Gini coefficient](#dod:gini) measures inequality on a scale from 0 to 1. Higher values indicate higher inequality. + description_key: + - Income is ‘post-tax’ — measured after taxes have been paid and most government benefits have been received. + - Income has been equivalized – adjusted to account for the fact that people in the same household can share costs like rent and heating. + presentation: + title_public: Gini coefficient + display: + name: Gini coefficient + numDecimalPlaces: 2 + tolerance: 5 + diff --git a/etl/steps/data/garden/insee/2024-04-05/inequality_france.py b/etl/steps/data/garden/insee/2024-04-05/inequality_france.py new file mode 100644 index 00000000000..f5cc41b8b28 --- /dev/null +++ b/etl/steps/data/garden/insee/2024-04-05/inequality_france.py @@ -0,0 +1,62 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("inequality_france") + + # Read table from meadow dataset. + tb = ds_meadow["inequality_france"].reset_index() + + # + # Process data. + tb = select_gini_and_create_spells(tb) + + tb = tb.format(["country", "year", "spell"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def select_gini_and_create_spells(tb: Table) -> Table: + """Select only 'Indice de Gini' indicator and create spells.""" + # Select only 'Indice de Gini' indicator. + tb = tb[tb["indicator"] == "Indice de Gini"].reset_index(drop=True) + + # Split year column into two columns: year and spell. The year column is the first four characters of the year column. + tb["year_new"] = tb["year"].str[:4] + + # Define spell as boolean where year is the same as the one before. + tb["spell"] = tb["year_new"] == tb["year_new"].shift(1) + + # Whenever spell is True, set spell to an increasing number. + tb["spell"] = tb["spell"].cumsum() + 1 + + # Drop year and indicator columns and rename year_new to year. + tb = tb.drop(columns=["year", "indicator"]).rename(columns={"year_new": "year"}) + + # Add country column. + tb["country"] = "France" + + # Rename value to gini. + tb = tb.rename(columns={"value": "gini"}) + + return tb diff --git a/etl/steps/data/garden/insee/2024-04-25/insee_premiere_1875.meta.yml b/etl/steps/data/garden/insee/2024-04-25/insee_premiere_1875.meta.yml new file mode 100644 index 00000000000..84e41986601 --- /dev/null +++ b/etl/steps/data/garden/insee/2024-04-25/insee_premiere_1875.meta.yml @@ -0,0 +1,174 @@ +# NOTE: To learn more about the fields, hover over their names. 
+definitions: + common: + presentation: + topic_tags: + - Economic Inequality + processing_level: minor + description_from_producer_inequality: |- + The data from 1996 to 2011 are backcasted to allow a temporal comparison and may therefore differ from those published elsewhere on INSEE presenting the long series with breaks in the series in 2010 and 2012. + description_from_producer_poverty: |- + Includes people surveyed in Metropolitan France, living in a household whose declared income is positive or zero and whose reference person is not a student. + description_key_post_tax_income: Income is ‘post-tax’ — measured after taxes have been paid and most government benefits have been received. + description_key_equivalized_income: Income has been equivalized – adjusted to account for the fact that people in the same household can share costs like rent and heating. + description_key_relative_poverty: This is a measure of _relative_ poverty – it captures the share of people whose income is low by the standards typical in their own country. + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + insee_premiere_1875: + variables: + gini: + title: Gini coefficient + unit: "" + short_unit: "" + description_short: The [Gini coefficient](#dod:gini) measures inequality on a scale from 0 to 1. Higher values indicate higher inequality. + description_key: + - "{definitions.description_key_post_tax_income}" + - "{definitions.description_key_equivalized_income}" + description_from_producer: "{definitions.description_from_producer_inequality}" + presentation: + title_public: Gini coefficient + display: + name: Gini coefficient + numDecimalPlaces: 2 + tolerance: 5 + + p90_p10_ratio: + title: P90/P10 ratio + unit: "" + short_unit: "" + description_short: P90 and P10 are the levels of income below which 90% and 10% of the population live, respectively. 
This variable gives the ratio of the two. It is a measure of inequality that indicates the gap between the richest and poorest tenth of the population. + description_key: + - "{definitions.description_key_post_tax_income}" + - "{definitions.description_key_equivalized_income}" + description_from_producer: "{definitions.description_from_producer_inequality}" + presentation: + title_public: P90/P10 ratio + display: + name: P90/P10 ratio + numDecimalPlaces: 1 + tolerance: 5 + + s80_s20_ratio: + title: S80/S20 ratio + unit: "" + short_unit: "" + description_short: The S80/S20 ratio is the share of income received by the richest 20% divided by the share received by the poorest 20%. + description_key: + - "{definitions.description_key_post_tax_income}" + - "{definitions.description_key_equivalized_income}" + description_from_producer: "{definitions.description_from_producer_inequality}" + presentation: + title_public: S80/S20 ratio + display: + name: S80/S20 ratio + numDecimalPlaces: 1 + tolerance: 5 + + headcount_50_median: + title: 50% of the median - Number in poverty + unit: "people" + short_unit: "" + description_short: Number of people with an income below 50% of the median. + description_key: + - "{definitions.description_key_relative_poverty}" + - "{definitions.description_key_post_tax_income}" + - "{definitions.description_key_equivalized_income}" + description_from_producer: "{definitions.description_from_producer_poverty}" + presentation: + title_public: Number of people below 50% of median income + display: + name: Number of people below 50% of median income + numDecimalPlaces: 0 + tolerance: 5 + + headcount_60_median: + title: 60% of the median - Number in poverty + unit: "people" + short_unit: "" + description_short: Number of people with an income below 60% of the median. 
+ description_key: + - "{definitions.description_key_relative_poverty}" + - "{definitions.description_key_post_tax_income}" + - "{definitions.description_key_equivalized_income}" + description_from_producer: "{definitions.description_from_producer_poverty}" + presentation: + title_public: Number of people below 60% of median income + display: + name: Number of people below 60% of median income + numDecimalPlaces: 0 + tolerance: 5 + + headcount_ratio_50_median: + title: 50% of the median - Share in poverty + unit: "%" + short_unit: "%" + description_short: Share of the population with an income below 50% of the median. + description_key: + - "{definitions.description_key_relative_poverty}" + - "{definitions.description_key_post_tax_income}" + - "{definitions.description_key_equivalized_income}" + description_from_producer: "{definitions.description_from_producer_poverty}" + presentation: + title_public: Share of population below 50% of median income + display: + name: Share of population below 50% of median income + numDecimalPlaces: 1 + tolerance: 5 + + headcount_ratio_60_median: + title: 60% of the median - Share in poverty + unit: "%" + short_unit: "%" + description_short: Share of the population with an income below 60% of the median. + description_key: + - "{definitions.description_key_relative_poverty}" + - "{definitions.description_key_post_tax_income}" + - "{definitions.description_key_equivalized_income}" + description_from_producer: "{definitions.description_from_producer_poverty}" + presentation: + title_public: Share of population below 60% of median income + display: + name: Share of population below 60% of median income + numDecimalPlaces: 1 + tolerance: 5 + + income_gap_ratio_50_median: + title: 50% of the median - Average shortfall (%) + unit: "%" + short_unit: "%" + description_short: This is the average shortfall expressed as a share of the poverty line, sometimes called the 'income gap ratio'. 
It captures the depth of poverty of those living on less than 50% of the median income. + description_key: + - "{definitions.description_key_relative_poverty}" + - "{definitions.description_key_post_tax_income}" + - "{definitions.description_key_equivalized_income}" + description_from_producer: "{definitions.description_from_producer_poverty}" + presentation: + title_public: Average shortfall from a poverty line of 50% of the median income or consumption (as a share of the poverty line) + display: + name: Average shortfall from a poverty line of 50% of the median income or consumption (as a share of the poverty line) + numDecimalPlaces: 1 + tolerance: 5 + + income_gap_ratio_60_median: + title: 60% of the median - Average shortfall (%) + unit: "%" + short_unit: "%" + description_short: This is the average shortfall expressed as a share of the poverty line, sometimes called the 'income gap ratio'. It captures the depth of poverty of those living on less than 60% of the median income. + description_key: + - "{definitions.description_key_relative_poverty}" + - "{definitions.description_key_post_tax_income}" + - "{definitions.description_key_equivalized_income}" + description_from_producer: "{definitions.description_from_producer_poverty}" + presentation: + title_public: Average shortfall from a poverty line of 60% of the median income or consumption (as a share of the poverty line) + display: + name: Average shortfall from a poverty line of 60% of the median income or consumption (as a share of the poverty line) + numDecimalPlaces: 1 + tolerance: 5 + diff --git a/etl/steps/data/garden/insee/2024-04-25/insee_premiere_1875.py b/etl/steps/data/garden/insee/2024-04-25/insee_premiere_1875.py new file mode 100644 index 00000000000..4ef305a1392 --- /dev/null +++ b/etl/steps/data/garden/insee/2024-04-25/insee_premiere_1875.py @@ -0,0 +1,54 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, 
create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Define columns to keep +COLUMNS_TO_KEEP = [ + "p90_p10_ratio", + "s80_s20_ratio", + "gini", + "headcount_50_median", + "headcount_60_median", + "headcount_ratio_50_median", + "headcount_ratio_60_median", + "income_gap_ratio_50_median", + "income_gap_ratio_60_median", +] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("insee_premiere_1875") + + # Read table from meadow dataset. + tb_inequality = ds_meadow["inequality"].reset_index() + tb_poverty = ds_meadow["poverty"].reset_index() + + # Merge both tables + tb = pr.merge(tb_inequality, tb_poverty, on=["country", "year"], how="outer", short_name=paths.short_name) + + # + # Process data. + tb = tb.format(["country", "year"]) + + # Keep relevant columns + tb = tb[COLUMNS_TO_KEEP] + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/garden/insee/2024-04-26/relative_poverty_france.meta.yml b/etl/steps/data/garden/insee/2024-04-26/relative_poverty_france.meta.yml new file mode 100644 index 00000000000..6ba402149f3 --- /dev/null +++ b/etl/steps/data/garden/insee/2024-04-26/relative_poverty_france.meta.yml @@ -0,0 +1,49 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + display: + name: Share in relative poverty + numDecimalPlaces: 1 + processing_level: major + description_key: + - Income is ‘post-tax’ — measured after taxes have been paid and most government benefits have been received. + - Income has been equivalized – adjusted to account for the fact that people in the same household can share costs like rent and heating. 
+ presentation: + title_public: Share in relative poverty + topic_tags: + - Poverty + - Economic Inequality + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + relative_poverty_france: + variables: + headcount_ratio_40_median: + title: 40% of the median - Share of population in poverty + unit: "%" + short_unit: "%" + description_short: Percentage of population living in households with an income below 40% of the median. + + headcount_ratio_50_median: + title: 50% of the median - Share of population in poverty + unit: "%" + short_unit: "%" + description_short: Percentage of population living in households with an income below 50% of the median. + + headcount_ratio_60_median: + title: 60% of the median - Share of population in poverty + unit: "%" + short_unit: "%" + description_short: Percentage of population living in households with an income below 60% of the median. + + headcount_ratio_70_median: + title: 70% of the median - Share of population in poverty + unit: "%" + short_unit: "%" + description_short: Percentage of population living in households with an income below 70% of the median. diff --git a/etl/steps/data/garden/insee/2024-04-26/relative_poverty_france.py b/etl/steps/data/garden/insee/2024-04-26/relative_poverty_france.py new file mode 100644 index 00000000000..8385108762c --- /dev/null +++ b/etl/steps/data/garden/insee/2024-04-26/relative_poverty_france.py @@ -0,0 +1,54 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("relative_poverty_france") + + # Read table from meadow dataset. 
+ tb = ds_meadow["relative_poverty_france"].reset_index() + + # + # Process data. + tb = create_spells(tb) + + tb = tb.format(["country", "year", "spell"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def create_spells(tb: Table) -> Table: + """Add a spell column that increments each time a row repeats the previous row's year.""" + + # Keep the first four characters of the original year column as the new year (year_new). + tb["year_new"] = tb["year"].str[:4] + + # Flag rows whose year is the same as the one in the previous row. + tb["spell"] = tb["year_new"] == tb["year_new"].shift(1) + + # Replace the flags with their running count plus one, so the spell increases at every repeated year. + tb["spell"] = tb["spell"].cumsum() + 1 + + # Drop the original year column and rename year_new to year. + tb = tb.drop(columns=["year"]).rename(columns={"year_new": "year"}) + + return tb diff --git a/etl/steps/data/garden/insee/2024-05-23/interdecile_ratio.meta.yml b/etl/steps/data/garden/insee/2024-05-23/interdecile_ratio.meta.yml new file mode 100644 index 00000000000..2d5c15ad2fe --- /dev/null +++ b/etl/steps/data/garden/insee/2024-05-23/interdecile_ratio.meta.yml @@ -0,0 +1,34 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Economic Inequality + processing_level: minor + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + interdecile_ratio: + variables: + p90_p50_ratio: + title: P90/P50 ratio + unit: "" + short_unit: "" + description_short: The P90/P50 ratio measures the degree of inequality within the richest half of the population.
A ratio of 2 means that someone just falling in the richest tenth of the population has twice the median income or consumption. + description_key: + - Income is ‘post-tax’ — measured after taxes have been paid and most government benefits have been received. + - Income has been equivalized – adjusted to account for the fact that people in the same household can share costs like rent and heating. + description_from_producer: Data includes people from Metropolitan France until 1999, France (mainland + overseas territories) since 2000 + presentation: + title_public: P90/P50 ratio + display: + name: P90/P50 ratio + numDecimalPlaces: 1 + tolerance: 5 + diff --git a/etl/steps/data/garden/insee/2024-05-23/interdecile_ratio.py b/etl/steps/data/garden/insee/2024-05-23/interdecile_ratio.py new file mode 100644 index 00000000000..fee118bf5a0 --- /dev/null +++ b/etl/steps/data/garden/insee/2024-05-23/interdecile_ratio.py @@ -0,0 +1,38 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Define columns to keep +COLUMNS_TO_KEEP = ["p90_p50_ratio"] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("interdecile_ratio") + + # Read table from meadow dataset. + tb = ds_meadow["interdecile_ratio"].reset_index() + + # + # Process data. + tb = tb.format(["country", "year"]) + + # Keep relevant columns + tb = tb[COLUMNS_TO_KEEP] + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. 
+ ds_garden.save() diff --git a/etl/steps/data/garden/insee/2024-05-23/interdecile_ratio_2022.meta.yml b/etl/steps/data/garden/insee/2024-05-23/interdecile_ratio_2022.meta.yml new file mode 100644 index 00000000000..322b254cc87 --- /dev/null +++ b/etl/steps/data/garden/insee/2024-05-23/interdecile_ratio_2022.meta.yml @@ -0,0 +1,34 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Economic Inequality + processing_level: minor + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + interdecile_ratio_2022: + variables: + p90_p50_ratio: + title: P90/P50 ratio + unit: "" + short_unit: "" + description_short: The P90/P50 ratio measures the degree of inequality within the richest half of the population. A ratio of 2 means that someone just falling in the richest tenth of the population has twice the median income or consumption. + description_key: + - Income is ‘post-tax’ — measured after taxes have been paid and most government benefits have been received. + - Income has been equivalized – adjusted to account for the fact that people in the same household can share costs like rent and heating. 
+ description_from_producer: Data includes people from Metropolitan France until 1999, France (mainland + overseas territories) since 2000 + presentation: + title_public: P90/P50 ratio + display: + name: P90/P50 ratio + numDecimalPlaces: 1 + tolerance: 5 + diff --git a/etl/steps/data/garden/insee/2024-05-23/interdecile_ratio_2022.py b/etl/steps/data/garden/insee/2024-05-23/interdecile_ratio_2022.py new file mode 100644 index 00000000000..70a1ccfffbf --- /dev/null +++ b/etl/steps/data/garden/insee/2024-05-23/interdecile_ratio_2022.py @@ -0,0 +1,38 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Define columns to keep +COLUMNS_TO_KEEP = ["p90_p50_ratio"] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("interdecile_ratio_2022") + + # Read table from meadow dataset. + tb = ds_meadow["interdecile_ratio_2022"].reset_index() + + # + # Process data. + tb = tb.format(["country", "year"]) + + # Keep relevant columns + tb = tb[COLUMNS_TO_KEEP] + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. 
+ ds_garden.save() diff --git a/etl/steps/data/garden/unu_wider/2024-04-22/world_income_inequality_database.countries.json b/etl/steps/data/garden/unu_wider/2024-04-22/world_income_inequality_database.countries.json new file mode 100644 index 00000000000..3f340d53566 --- /dev/null +++ b/etl/steps/data/garden/unu_wider/2024-04-22/world_income_inequality_database.countries.json @@ -0,0 +1,203 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas, The": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape Verde": "Cape Verde", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Costa Rica": "Costa Rica", + "Cote d'Ivoire": "Cote d'Ivoire", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "Czechoslovakia": "Czechoslovakia", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": 
"Gabon", + "Gambia, The": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Greenland": "Greenland", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hong Kong": "Hong Kong", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Korea, Republic of": "South Korea", + "Kosovo": "Kosovo", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Laos": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Moldova": "Moldova", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Puerto Rico": "Puerto Rico", + "Qatar": "Qatar", + 
"Reunion": "Reunion", + "Romania": "Romania", + "Russia": "Russia", + "Rwanda": "Rwanda", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Serbia and Montenegro": "Serbia and Montenegro", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Soviet Union": "USSR", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syria": "Syria", + "Taiwan": "Taiwan", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkmenistan": "Turkmenistan", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United States": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela": "Venezuela", + "Vietnam": "Vietnam", + "West Bank and Gaza": "Palestine", + "Yemen": "Yemen", + "Yugoslavia": "Yugoslavia", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Congo, Democratic Republic of the": "Democratic Republic of Congo", + "Congo, Republic of the": "Congo", + "Micronesia, Federated States of": "Micronesia (country)", + "Turkiye": "Turkey" +} \ No newline at end of file diff --git 
a/etl/steps/data/garden/unu_wider/2024-04-22/world_income_inequality_database.meta.yml b/etl/steps/data/garden/unu_wider/2024-04-22/world_income_inequality_database.meta.yml new file mode 100644 index 00000000000..1d61c3ccc0b --- /dev/null +++ b/etl/steps/data/garden/unu_wider/2024-04-22/world_income_inequality_database.meta.yml @@ -0,0 +1,31 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + processing_level: minor + presentation: + topic_tags: + - Economic Inequality + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + world_income_inequality_database: + variables: + gini: + title: Gini index + unit: "" + short_unit: "" + description_short: The [Gini index](#dod:gini) measures inequality on a scale from 0 to 100. Higher values indicate higher inequality. Inequality is measured here in terms of different types of income and consumption. + processing_level: minor + presentation: + title_public: Gini index + display: + name: Gini index + numDecimalPlaces: 2 + tolerance: 5 + diff --git a/etl/steps/data/garden/unu_wider/2024-04-22/world_income_inequality_database.py b/etl/steps/data/garden/unu_wider/2024-04-22/world_income_inequality_database.py new file mode 100644 index 00000000000..fa3a52958f4 --- /dev/null +++ b/etl/steps/data/garden/unu_wider/2024-04-22/world_income_inequality_database.py @@ -0,0 +1,55 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + +# Define index variables +INDEX_VARS = [ + "country", + "year", + "source_detailed", + "resource_detailed", + "scale_detailed", + "sharing_unit", + "reference_unit", + "areacovr_detailed", + "popcovr_detailed", + "source_comments", + "survey", +] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("world_income_inequality_database") + + # Read table from meadow dataset. + tb = ds_meadow["world_income_inequality_database"].reset_index() + + # + # Process data. + # Select only gini + tb = tb[INDEX_VARS + ["gini"]] + # + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.country_mapping_path, + ) + tb = tb.format(INDEX_VARS) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/meadow/chartbook/2024-04-22/concialdi.py b/etl/steps/data/meadow/chartbook/2024-04-22/concialdi.py new file mode 100644 index 00000000000..ae67f6415cb --- /dev/null +++ b/etl/steps/data/meadow/chartbook/2024-04-22/concialdi.py @@ -0,0 +1,29 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("concialdi.csv") + + # Load data from snapshot. + tb = snap.read() + + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. 
+ ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/chartbook/2024-05-23/wealth_france.py b/etl/steps/data/meadow/chartbook/2024-05-23/wealth_france.py new file mode 100644 index 00000000000..79b7d5abd00 --- /dev/null +++ b/etl/steps/data/meadow/chartbook/2024-05-23/wealth_france.py @@ -0,0 +1,35 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("wealth_france.csv") + + # Load data from snapshot. + tb = snap.read() + + # Add country + tb["country"] = "France" + + # + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/insee/2024-03-21/inequality_france_1999.py b/etl/steps/data/meadow/insee/2024-03-21/inequality_france_1999.py new file mode 100644 index 00000000000..af1241b8517 --- /dev/null +++ b/etl/steps/data/meadow/insee/2024-03-21/inequality_france_1999.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("inequality_france_1999.csv") + + # Load data from snapshot. 
+ tb = snap.read() + + # + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/insee/2024-04-05/inequality_france.py b/etl/steps/data/meadow/insee/2024-04-05/inequality_france.py new file mode 100644 index 00000000000..bbef1e787ab --- /dev/null +++ b/etl/steps/data/meadow/insee/2024-04-05/inequality_france.py @@ -0,0 +1,40 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("inequality_france.xlsx") + + # Load data from snapshot. + tb = snap.read(header=2) + + # + # Process data. + # Make column names strings + tb.columns = tb.columns.astype(str) + + # Make table long + tb = tb.melt(id_vars=["Indicateur"], var_name="year", value_name="value") + + # Rename Indicateur to indicator + tb = tb.rename(columns={"Indicateur": "indicator"}) + + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["indicator", "year"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. 
+ ds_meadow.save() diff --git a/etl/steps/data/meadow/insee/2024-04-25/insee_premiere_1875.py b/etl/steps/data/meadow/insee/2024-04-25/insee_premiere_1875.py new file mode 100644 index 00000000000..9cf292a8b75 --- /dev/null +++ b/etl/steps/data/meadow/insee/2024-04-25/insee_premiere_1875.py @@ -0,0 +1,124 @@ +"""Load a snapshot and create a meadow dataset.""" + +from typing import Dict + +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Define columns and their new names +COLUMNS_INEQUALITY = { + "Unnamed: 0": "year", + "Ratio \nInter-décile D9/D1": "p90_p10_ratio", + "Ratio\n(100-S80)/S20": "s80_s20_ratio", + "Indice \nDe Gini ": "gini", +} + +COLUMNS_POVERTY = { + "Nombre de personnes pauvres (en milliers)": "headcount", + "Taux de pauvreté (en %)": "headcount_ratio", + "Seuil de pauvreté (en euros 2019/mois)": "poverty_line", + "Niveau de vie médian des personnes pauvres (en euros 2019/mois)": "average_income_in_poverty", + "Intensité de la pauvreté (en %)": "income_gap_ratio", +} + +# Define relative poverty names +RELATIVE_POVERTY = {"Seuil à 60 % de la médiane": "60_median", "Seuil à 50 % de la médiane": "50_median"} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("insee_premiere_1875.xlsx") + + # Load data from snapshot. + tb_inequality = snap.read(sheet_name="Figure 2", skiprows=2) + tb_poverty = snap.read(sheet_name="Figure 3", skiprows=2) + + # Process data. + tb_inequality = process_inequality_data(tb=tb_inequality, columns=COLUMNS_INEQUALITY, short_name="inequality") + tb_poverty = process_poverty_data( + tb=tb_poverty, columns=COLUMNS_POVERTY, relative_poverty_names=RELATIVE_POVERTY, short_name="poverty" + ) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. 
+ ds_meadow = create_dataset( + dest_dir, tables=[tb_inequality, tb_poverty], check_variables_metadata=True, default_metadata=snap.metadata + ) + + # Save changes in the new meadow dataset. + ds_meadow.save() + + +def process_inequality_data(tb: Table, columns: Dict[str, str], short_name: str) -> Table: + """ + Rename columns, keep rows with a numeric year, cast year to int, add the country and format the table. + """ + # Rename columns + tb = tb.rename(columns=columns, errors="raise") + + # Make year integer + # Keep only rows whose year is purely numeric + tb = tb[tb["year"].astype(str).str.isnumeric()].reset_index(drop=True) + + # Convert the remaining year values to integer + + tb["year"] = tb["year"].astype(int) + + # Add country + tb["country"] = "France" + + # Format + tb = tb.format(short_name=short_name) + + return tb + + +def process_poverty_data( + tb: Table, columns: Dict[str, str], relative_poverty_names: Dict[str, str], short_name: str +) -> Table: + """ + Reshape the poverty data into one row per year and one column per indicator and relative poverty line. + """ + + tb = tb.copy() + + # Rename first column + tb = tb.rename(columns={"Unnamed: 0": "indicator"}, errors="raise") + + # Create a new column, relative_poverty, when the indicator is in relative_poverty_names.keys() + tb["relative_poverty"] = tb["indicator"].map(relative_poverty_names) + + # Fill the NaN values in the relative_poverty column with the last non missing value + tb["relative_poverty"] = tb["relative_poverty"].ffill() + + # Keep only the indicators listed in the columns dictionary and rename them with it + tb = tb[tb["indicator"].isin(columns.keys())].reset_index(drop=True) + tb["indicator"] = tb["indicator"].map(columns) + + # Create indicator as the concatenation of indicator and relative_poverty + tb["indicator"] = tb["indicator"] + "_" + tb["relative_poverty"] + + # Drop relative_poverty column + tb = tb.drop(columns=["relative_poverty"]) + + tb = tb.melt(id_vars=["indicator"], var_name="year", value_name="value") + + # Make table wide + tb = tb.pivot(index="year", columns="indicator", values="value").reset_index() + + # Add country + tb["country"] = "France"
+ + # Format + tb = tb.format(short_name=short_name) + + return tb diff --git a/etl/steps/data/meadow/insee/2024-04-26/relative_poverty_france.py b/etl/steps/data/meadow/insee/2024-04-26/relative_poverty_france.py new file mode 100644 index 00000000000..1803b72cfe3 --- /dev/null +++ b/etl/steps/data/meadow/insee/2024-04-26/relative_poverty_france.py @@ -0,0 +1,77 @@ +"""Load a snapshot and create a meadow dataset.""" + +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Relative poverty lines and their new names +POVERTY_LINES = { + "Seuil à 40 %": "headcount_ratio_40_median", + "Seuil à 50 %": "headcount_ratio_50_median", + "Seuil à 60 %": "headcount_ratio_60_median", + "Seuil à 70 %": "headcount_ratio_70_median", +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("relative_poverty_france.xlsx") + + # Load data from snapshot. + tb = snap.read(sheet_name="Données", skiprows=3) + + # + # Process data. + tb = reformat_table(tb) + + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() + + +def reformat_table(tb: Table) -> Table: + """ + Keep only rows with data, transpose and add country column. 
+ """ + + tb = tb.copy() + + # In the first column, keep only rows starting with "Seuil" + tb = tb[tb.iloc[:, 0].str.startswith("Seuil")] + + # Rename categories in the first column + tb.iloc[:, 0] = tb.iloc[:, 0].map(POVERTY_LINES) + + # Set the first column as the index + tb = tb.set_index(tb.columns[0]) + + # Invert the table + tb_transposed = tb.copy().T + + # Copy metadata from tb + for col in tb_transposed.columns: + tb_transposed[col] = tb_transposed[col].copy_metadata(tb[1975]) + + # Reset index and rename first column to "year" + tb_transposed = tb_transposed.reset_index() + tb_transposed = tb_transposed.rename(columns={"index": "year"}) + tb_transposed["year"] = tb_transposed["year"].astype("string") + + # Add country + tb_transposed["country"] = "France" + + return tb_transposed diff --git a/etl/steps/data/meadow/insee/2024-05-23/interdecile_ratio.py b/etl/steps/data/meadow/insee/2024-05-23/interdecile_ratio.py new file mode 100644 index 00000000000..1aecc0b1a7a --- /dev/null +++ b/etl/steps/data/meadow/insee/2024-05-23/interdecile_ratio.py @@ -0,0 +1,47 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Define original columns and their new names +COLUMNS = { + "Libellé": "year", + "Rapport interdécile D9/D5 du salaire net annuel en équivalent temps plein dans le secteur privé - Ensemble des salariés": "p90_p50_ratio", + "Codes": "codes", +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("interdecile_ratio.csv") + + # Load data from snapshot. + tb = snap.read(sep=";") + + # Process data. 
+ + # Rename columns + tb = tb.rename(columns=COLUMNS) + + # Remove all rows in year that are not numeric + tb = tb[tb["year"].str.isnumeric()].reset_index(drop=True) + + # Add country + tb["country"] = "France" + + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/insee/2024-05-23/interdecile_ratio_2022.py b/etl/steps/data/meadow/insee/2024-05-23/interdecile_ratio_2022.py new file mode 100644 index 00000000000..fe1c3780c2b --- /dev/null +++ b/etl/steps/data/meadow/insee/2024-05-23/interdecile_ratio_2022.py @@ -0,0 +1,50 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Define original columns and their new names +COLUMNS = { + "Label": "year", + "Inter-decile ratio D9/D5 of the annual net salary for full-time jobs - All salaried workers": "p90_p50_ratio", + "Codes": "codes", +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("interdecile_ratio_2022.csv") + + # Load data from snapshot. + tb = snap.read(sep=";") + + # Process data. + + # Rename columns + tb = tb.rename(columns=COLUMNS) + + # Remove all rows in year that are not numeric + tb = tb[tb["year"].str.isnumeric()].reset_index(drop=True) + + # Make p90_p50_ratio float + tb["p90_p50_ratio"] = tb["p90_p50_ratio"].str.replace(",", ".").astype(float) + + # Add country + tb["country"] = "France" + + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. 
+ tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/unu_wider/2024-04-22/world_income_inequality_database.py b/etl/steps/data/meadow/unu_wider/2024-04-22/world_income_inequality_database.py new file mode 100644 index 00000000000..b7c63ebe650 --- /dev/null +++ b/etl/steps/data/meadow/unu_wider/2024-04-22/world_income_inequality_database.py @@ -0,0 +1,43 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("world_income_inequality_database.xlsx") + + # Load data from snapshot. + tb = snap.read(sheet_name="Sheet1") + + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format( + [ + "country", + "year", + "source_detailed", + "resource_detailed", + "scale_detailed", + "sharing_unit", + "reference_unit", + "areacovr_detailed", + "popcovr_detailed", + "source_comments", + "survey", + ] + ) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. 
+ ds_meadow.save() diff --git a/snapshots/chartbook/2024-04-22/concialdi.csv.dvc b/snapshots/chartbook/2024-04-22/concialdi.csv.dvc new file mode 100644 index 00000000000..43fb060d08f --- /dev/null +++ b/snapshots/chartbook/2024-04-22/concialdi.csv.dvc @@ -0,0 +1,28 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Concialdi's data on income inequality + description: |- + Concialdi data about income inequality in France, published in the book Changing patterns in the distribution of economic welfare: An international perspective. + date_published: "1997-01-01" + + # Citation + producer: Concialdi + citation_full: |- + Concialdi, P, 1997, “Income distribution in France : The mid-1980s turning point” in P Gottschalk, B Gustafsson and E Palmer, editors, Changing patterns in the distribution of economic welfare: An international perspective, Cambridge University Press, Cambridge. + Table 11.11 + + # Files + url_main: https://www.cambridge.org/us/universitypress/subjects/economics/public-economics-and-public-policy/changing-patterns-distribution-economic-welfare-economic-perspective + date_accessed: 2024-04-22 + + # License + license: + name: CC BY 4.0 + +outs: + - md5: 7ef740a49ca80c9d549eebb432eb2805 + size: 63 + path: concialdi.csv diff --git a/snapshots/chartbook/2024-04-22/concialdi.py b/snapshots/chartbook/2024-04-22/concialdi.py new file mode 100644 index 00000000000..de1e585f2a4 --- /dev/null +++ b/snapshots/chartbook/2024-04-22/concialdi.py @@ -0,0 +1,33 @@ +""" +Script to create a snapshot of dataset. + +The file comes from Table 11.11 in the original paper, available in this book https://www.cambridge.org/us/universitypress/subjects/economics/public-economics-and-public-policy/changing-patterns-distribution-economic-welfare-economic-perspective +Due to the paywall, I am creating a csv file from the data extracted in the past by the Chartbook team.
See https://docs.google.com/spreadsheets/d/1sySmwtRs_MvRrcVj52o9b0nG-bzbPMu1haIoSK8IWqk/edit?gid=1521965312#gid=1521965312 +After creating the file, run + python snapshots/chartbook/2024-04-22/concialdi.py --path-to-file <path-to-local-csv> + +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"chartbook/{SNAPSHOT_VERSION}/concialdi.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/chartbook/2024-05-23/wealth_france.csv.dvc b/snapshots/chartbook/2024-05-23/wealth_france.csv.dvc new file mode 100644 index 00000000000..9abcab6b41b --- /dev/null +++ b/snapshots/chartbook/2024-05-23/wealth_france.csv.dvc @@ -0,0 +1,28 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: "Wealth Concentration in a Developing Economy: Paris and France, 1807-1994" + description: |- + Using large samples of estate tax returns, we construct new series on wealth concentration in Paris and France from 1807 to 1994. Inequality increased until 1914 because industrial and financial estates grew dramatically. Then, adverse shocks, rather than a Kuznets-type process, led to a massive decline in inequality. The very high wealth concentration prior to 1914 benefited retired individuals living off capital income (rentiers) rather than entrepreneurs. 
The very rich were in their seventies and eighties, whereas they had been in their fifties a half century earlier and would be so again after World War II. Our results shed new light on ongoing debates about wealth inequality and growth. + date_published: "2006-03-01" + + # Citation + producer: Piketty et al. + citation_full: |- + Piketty, Thomas, Gilles Postel-Vinay, and Jean-Laurent Rosenthal. 2006. "Wealth Concentration in a Developing Economy: Paris and France, 1807-1994." American Economic Review, 96 (1): 236-256, Table 2. + + # Files + url_main: https://www.aeaweb.org/articles?id=10.1257/000282806776157614 + date_accessed: 2024-05-23 + + # License + license: + name: CC BY 4.0 + url: https://www.aeaweb.org/articles?id=10.1257/000282806776157614 + +outs: + - md5: 66e56ad3a768a60b3ea1af468da7c804 + size: 378 + path: wealth_france.csv diff --git a/snapshots/chartbook/2024-05-23/wealth_france.py b/snapshots/chartbook/2024-05-23/wealth_france.py new file mode 100644 index 00000000000..377757bad23 --- /dev/null +++ b/snapshots/chartbook/2024-05-23/wealth_france.py @@ -0,0 +1,29 @@ +""" +Script to create a snapshot of dataset. + +The csv file is extracted from table 2 of the paper "Wealth Concentration in a Developing Economy: Paris and France, 1807-1994", available at https://www.aeaweb.org/articles?id=10.1257/000282806776157614. +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"chartbook/{SNAPSHOT_VERSION}/wealth_france.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. 
+ snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/insee/2024-03-21/inequality_france_1999.csv.dvc b/snapshots/insee/2024-03-21/inequality_france_1999.csv.dvc new file mode 100644 index 00000000000..9de00b9fba1 --- /dev/null +++ b/snapshots/insee/2024-03-21/inequality_france_1999.csv.dvc @@ -0,0 +1,33 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Household income and wealth + description: |- + Public statistics in France about household income and wealth. + date_published: "1999-09-01" + version_producer: 1999 edition + title_snapshot: Household income and wealth - Gini coefficient + description_snapshot: |- + Gini coefficient estimates for equivalized disposable household income in France between 1970 and 1996. + + # Citation + producer: National Institute of Statistics and Economic Studies + citation_full: |- + INSEE (1999) Syntheses. Revenus et patrimoine des ménages, édition 1999 [Syntheses. Household income and wealth]. Table 10, page 32. + attribution_short: INSEE + + # Files + url_main: https://www.bnsp.insee.fr/ark:/12148/bc6p06xz84t.r=hourriez?rk=85837;2# + date_accessed: 2024-03-21 + + # License + license: + name: CC BY 4.0 + url: https://www.bnsp.insee.fr/ark:/12148/bc6p06xz84t.r=hourriez?rk=85837;2# + +outs: + - md5: a6e161b219db7fd1983ede5eb6e6d9f4 + size: 145 + path: inequality_france_1999.csv diff --git a/snapshots/insee/2024-03-21/inequality_france_1999.py b/snapshots/insee/2024-03-21/inequality_france_1999.py new file mode 100644 index 00000000000..59299a5037b --- /dev/null +++ b/snapshots/insee/2024-03-21/inequality_france_1999.py @@ -0,0 +1,29 @@ +""" +Script to create a snapshot of dataset. +The file was created by manually creating a csv file from the data contained in the last row of the Table 10 (p. 32), in a long format. 
+The book is available at this link https://www.bnsp.insee.fr/ark:/12148/bc6p06xz84t/f1.pdf +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"insee/{SNAPSHOT_VERSION}/inequality_france_1999.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/insee/2024-04-05/inequality_france.py b/snapshots/insee/2024-04-05/inequality_france.py new file mode 100644 index 00000000000..fa35ff79518 --- /dev/null +++ b/snapshots/insee/2024-04-05/inequality_france.py @@ -0,0 +1,24 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"insee/{SNAPSHOT_VERSION}/inequality_france.xlsx") + + # Download data from source, add file to DVC and upload to S3. 
+ snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/insee/2024-04-05/inequality_france.xlsx.dvc b/snapshots/insee/2024-04-05/inequality_france.xlsx.dvc new file mode 100644 index 00000000000..85f820f8091 --- /dev/null +++ b/snapshots/insee/2024-04-05/inequality_france.xlsx.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Standard of living and inequality indicators + description: |- + Public statistics in France about household income. + date_published: "2023-11-14" + + # Citation + producer: National Institute of Statistics and Economic Studies + citation_full: |- + INSEE (2023) Niveau de vie et indicateurs d’inégalités. Données annuelles de 1975 à 2021 [Standard of living and inequality indicators. Annual data from 1975 to 2021]. Retrieved from https://www.insee.fr/fr/statistiques/2491918 on 5 April 2024. + attribution_short: INSEE + + # Files + url_main: https://www.insee.fr/fr/statistiques/2491918 + url_download: https://www.insee.fr/fr/statistiques/fichier/2491918/reve-niv-vie-indic-inegalite.xlsx + date_accessed: 2024-04-05 + + # License + license: + name: Mentions légales et crédits [Legal notices and credits] + url: https://www.insee.fr/fr/information/2008466 + +outs: + - md5: 2e902f9c80e05f52ac3b81931c8658b7 + size: 18310 + path: inequality_france.xlsx diff --git a/snapshots/insee/2024-04-25/insee_premiere_1875.py b/snapshots/insee/2024-04-25/insee_premiere_1875.py new file mode 100644 index 00000000000..7c8faa499ad --- /dev/null +++ b/snapshots/insee/2024-04-25/insee_premiere_1875.py @@ -0,0 +1,24 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. 
+SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"insee/{SNAPSHOT_VERSION}/insee_premiere_1875.xlsx") + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/insee/2024-04-25/insee_premiere_1875.xlsx.dvc b/snapshots/insee/2024-04-25/insee_premiere_1875.xlsx.dvc new file mode 100644 index 00000000000..042f4bc8753 --- /dev/null +++ b/snapshots/insee/2024-04-25/insee_premiere_1875.xlsx.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: "INSEE Premiere 1875: In 2019, the median standard of living increases significantly and the poverty rate decreases" + description: |- + Inequality and distributional report of the main statistical office in France. + date_published: "2021-10-05" + + # Citation + producer: National Institute of Statistics and Economic Studies + citation_full: |- + INSEE (2021) En 2019, le niveau de vie médian augmente nettement et le taux de pauvreté diminue [In 2019, the median standard of living increases significantly and the poverty rate decreases]. Retrieved from https://www.insee.fr/fr/statistiques/5431993 on 25 April 2024. 
+ attribution_short: INSEE + + # Files + url_main: https://www.insee.fr/fr/statistiques/5431993 + url_download: https://www.insee.fr/fr/statistiques/fichier/5431993/ip1875.xlsx + date_accessed: 2024-04-25 + + # License + license: + name: Mentions légales et crédits [Legal notices and credits] + url: https://www.insee.fr/fr/information/2008466 + +outs: + - md5: 0fbfbe98ff1f0f0811b068a20a856551 + size: 32719 + path: insee_premiere_1875.xlsx diff --git a/snapshots/insee/2024-04-26/relative_poverty_france.py b/snapshots/insee/2024-04-26/relative_poverty_france.py new file mode 100644 index 00000000000..d47ae44f55a --- /dev/null +++ b/snapshots/insee/2024-04-26/relative_poverty_france.py @@ -0,0 +1,24 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"insee/{SNAPSHOT_VERSION}/relative_poverty_france.xlsx") + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/insee/2024-04-26/relative_poverty_france.xlsx.dvc b/snapshots/insee/2024-04-26/relative_poverty_france.xlsx.dvc new file mode 100644 index 00000000000..79a78a60da8 --- /dev/null +++ b/snapshots/insee/2024-04-26/relative_poverty_france.xlsx.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Poverty rates by poverty line + description: |- + Public statistics in France about relative poverty. 
+ date_published: "2023-11-14" + + # Citation + producer: National Institute of Statistics and Economic Studies + citation_full: |- + INSEE (2023) Taux de pauvreté selon le seuil de pauvreté. Données annuelles de 1975 à 2021 [Poverty rates by poverty line. Annual data from 1975 to 2021]. Retrieved from https://www.insee.fr/fr/statistiques/2408282 on 26 April 2024. + attribution_short: INSEE + + # Files + url_main: https://www.insee.fr/fr/statistiques/2408282 + url_download: https://www.insee.fr/fr/statistiques/fichier/2408282/reve-pauv-taux-seuil.xlsx + date_accessed: 2024-04-26 + + # License + license: + name: Mentions légales et crédits [Legal notices and credits] + url: https://www.insee.fr/fr/information/2008466 + +outs: + - md5: 5cadaf8ff95502aa51f8947ccb567504 + size: 33183 + path: relative_poverty_france.xlsx diff --git a/snapshots/insee/2024-05-23/interdecile_ratio.csv.dvc b/snapshots/insee/2024-05-23/interdecile_ratio.csv.dvc new file mode 100644 index 00000000000..b4470969d22 --- /dev/null +++ b/snapshots/insee/2024-05-23/interdecile_ratio.csv.dvc @@ -0,0 +1,29 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: D9/D5 interdecile ratio for net salary in France + description: |- + D9/D5 interdecile ratio of annual net salary in full-time equivalent in the private sector - All employees. + date_published: "2023-12-13" + + # Citation + producer: National Institute of Statistics and Economic Studies + citation_full: |- + INSEE (2023) Rapport interdécile D9/D5 du salaire net annuel en équivalent temps plein dans le secteur privé - Ensemble des salariés [D9/D5 interdecile ratio of annual net salary in full-time equivalent in the private sector - All employees]. 
+ attribution_short: INSEE + + # Files + url_main: https://www.insee.fr/fr/statistiques/serie/010752360 + date_accessed: 2024-05-23 + + # License + license: + name: Mentions légales et crédits [Legal notices and credits] + url: https://www.insee.fr/fr/information/2008466 + +outs: + - md5: 237111f1c668f38a614f3559cf7fe03d + size: 715 + path: interdecile_ratio.csv diff --git a/snapshots/insee/2024-05-23/interdecile_ratio.py b/snapshots/insee/2024-05-23/interdecile_ratio.py new file mode 100644 index 00000000000..63c008aec53 --- /dev/null +++ b/snapshots/insee/2024-05-23/interdecile_ratio.py @@ -0,0 +1,34 @@ +""" +Script to create a snapshot of dataset. + +To download the dataset + 1. Click on the Télécharger button tab. + 2. Select the oldest Date de début and the latest Date de fin. + 3. Click on the Télécharger (csv) button. + 4. Extract the valeurs_annuelles.csv file from the downloaded zip file. + 5. Run the script with the path to the extracted file. +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"insee/{SNAPSHOT_VERSION}/interdecile_ratio.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. 
+ snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/insee/2024-05-23/interdecile_ratio_2022.csv.dvc b/snapshots/insee/2024-05-23/interdecile_ratio_2022.csv.dvc new file mode 100644 index 00000000000..e3c035e6536 --- /dev/null +++ b/snapshots/insee/2024-05-23/interdecile_ratio_2022.csv.dvc @@ -0,0 +1,29 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: D9/D5 interdecile ratio for net salary in France + description: |- + D9/D5 interdecile ratio of annual net salary in full-time equivalent in the private sector - All employees + date_published: "2022" + + # Citation + producer: National Institute of Statistics and Economic Studies + citation_full: |- + INSEE (2022) Rapport interdécile D9/D5 du salaire net annuel en équivalent temps plein dans le secteur privé - Ensemble des salariés [D9/D5 interdecile ratio of annual net salary in full-time equivalent in the private sector - All employees]. + attribution_short: INSEE + + # Files + url_main: https://www.insee.fr/fr/statistiques/serie/010752360 + date_accessed: 2024-05-23 + + # License + license: + name: Mentions légales et crédits [Legal notices and credits] + url: https://www.insee.fr/fr/information/2008466 + +outs: + - md5: 9b88608d4dfbd4c71d1e34c589fd96b5 + size: 957 + path: interdecile_ratio_2022.csv diff --git a/snapshots/insee/2024-05-23/interdecile_ratio_2022.py b/snapshots/insee/2024-05-23/interdecile_ratio_2022.py new file mode 100644 index 00000000000..d1e87a0e735 --- /dev/null +++ b/snapshots/insee/2024-05-23/interdecile_ratio_2022.py @@ -0,0 +1,30 @@ +""" +Script to create a snapshot of dataset. + +The file was obtained from a previous version of the dataset, accessed in 2022 from the INSEE website by the Chartbook team. 
+You can find that file here: https://drive.google.com/file/d/1-Wk2hJo2gyelmJ32k1eZCBE-TfQcQEhD/view +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"insee/{SNAPSHOT_VERSION}/interdecile_ratio_2022.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/unu_wider/2024-04-22/world_income_inequality_database.py b/snapshots/unu_wider/2024-04-22/world_income_inequality_database.py new file mode 100644 index 00000000000..ce3082f4b47 --- /dev/null +++ b/snapshots/unu_wider/2024-04-22/world_income_inequality_database.py @@ -0,0 +1,24 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"unu_wider/{SNAPSHOT_VERSION}/world_income_inequality_database.xlsx") + + # Download data from source, add file to DVC and upload to S3. 
+ snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/unu_wider/2024-04-22/world_income_inequality_database.xlsx.dvc b/snapshots/unu_wider/2024-04-22/world_income_inequality_database.xlsx.dvc new file mode 100644 index 00000000000..43ab3485c7c --- /dev/null +++ b/snapshots/unu_wider/2024-04-22/world_income_inequality_database.xlsx.dvc @@ -0,0 +1,29 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: World Income Inequality Database (WIID) + description: |- + The World Income Inequality Database (WIID) presents information on income inequality for developed, developing, and transition countries. It provides the most comprehensive set of income inequality statistics available and can be downloaded for free. + date_published: "2023-11-28" + + # Citation + producer: UNU-WIDER + citation_full: |- + UNU-WIDER, World Income Inequality Database (WIID). Version 28 November 2023. https://doi.org/10.35188/UNU-WIDER/WIID-281123 + + # Files + url_main: https://www.wider.unu.edu/project/world-income-inequality-database-wiid + url_download: https://www.wider.unu.edu/sites/default/files/Data/WIID_28NOV2023.xlsx + date_accessed: 2024-04-22 + + # License + license: + name: CC BY 4.0 + url: https://www.wider.unu.edu/project/world-income-inequality-database-wiid + +outs: + - md5: 4d996ca2b59dc65bee980bddc429cba7 + size: 9070639 + path: world_income_inequality_database.xlsx