diff --git a/dag/main.yml b/dag/main.yml index f06b7e8b305..e0ee162104e 100644 --- a/dag/main.yml +++ b/dag/main.yml @@ -257,6 +257,7 @@ steps: data://garden/happiness/2024-06-09/happiness: - data://meadow/happiness/2024-06-09/happiness + - data://meadow/happiness/2024-06-20/happiness_ages - data://garden/happiness/2023-03-20/happiness - data://garden/demography/2023-03-31/population - data://garden/regions/2023-01-01/regions @@ -264,6 +265,9 @@ steps: data://grapher/happiness/2024-06-09/happiness: - data://garden/happiness/2024-06-09/happiness + data://meadow/happiness/2024-06-20/happiness_ages: + - snapshot://happiness/2024-06-20/happiness_ages.xls + # LGBTI Policy Index (Velasco, 2020) data://meadow/lgbt_rights/2023-04-27/lgbti_policy_index: diff --git a/etl/steps/data/garden/happiness/2024-06-09/happiness.meta.yml b/etl/steps/data/garden/happiness/2024-06-09/happiness.meta.yml index 31e6c6f43ef..e387616b2f3 100644 --- a/etl/steps/data/garden/happiness/2024-06-09/happiness.meta.yml +++ b/etl/steps/data/garden/happiness/2024-06-09/happiness.meta.yml @@ -1,9 +1,85 @@ +# NOTE: To learn more about the fields, hover over their names. definitions: common: - processing_level: major presentation: topic_tags: - Happiness & Life Satisfaction + attribution_short: WHR + display: + numDecimalPlaces: 2 + processing_level: major + origins: + # Data product / Snapshot + - title: World Happiness Report + description: |- + The World Happiness Report is a partnership of Gallup, the Oxford Wellbeing Research Centre, the UN Sustainable Development Solutions Network, and the WHR’s Editorial Board. + It reviews the state of happiness in the world today and shows how the science of happiness explains personal and national variations in happiness. + date_published: "2024-03-08" + version_producer: 2024 + # Citation + producer: Wellbeing Research Centre + citation_full: |- + Helliwell, J. F., Layard, R., Sachs, J. D., De Neve, J.-E., Aknin, L. B., & Wang, S. (Eds.). (2024). World Happiness Report 2024. University of Oxford: Wellbeing Research Centre. + attribution_short: WHR + # Files + url_main: https://worldhappiness.report/ed/2024/ + date_accessed: 2024-06-20 + # License + license: + name: "" + url: https://worldhappiness.report/ed/2024/ + description_short: + "Self-reported life satisfaction is the answer to the question: + 'Imagine a ladder with the best possible life being a 10, and the worst possible life being a 0. + Which step on that ladder would you say you stand on right now?'" + description_processing: + Average of regions is calculated by taking a population-weighted average over all countries within that region. + Since data per age group is not available for all countries, regional aggregates can sometimes differ. + description_key: + - The Cantril ladder asks respondents to think of a ladder, with the best possible life for them being a 10 and the worst possible life being a 0. They are then asked to rate their own current lives on that 0 to 10 scale. + - The rankings are calculated by the source based on nationally representative samples for the three years prior to the year of the report, so that data for the 2024 report will draw from survey data from 2021-2023. We show the data for final year of the three-year survey period, i.e. we show the 2021-2023 survey data as 2023. + - The only exception is the data for the 2012 report, which uses survey data from 2005-2011, we show this data as the final year of the survey data - 2011. + - The number of people and countries surveyed varies year to year, but typically more than 100,000 people in 130 countries participate in the Gallup World Poll each year. + - The rankings are based entirely on the survey scores, using the Gallup weights to make the estimates representative. + - The data is the compilation of all previous World Happiness Reports, which can be found at https://worldhappiness.report/archive/. + metric: + title: + <% if age_group == "below 30" %> + Life satisfaction of people below 30 + <% elif age_group == "30-44" %> + Life satisfaction of people aged 30-44 + <% elif age_group == "45-59" %> + Life satisfaction of people aged 45-59 + <% elif age_group == "60 and above" %> + Life satisfaction of people aged 60 and above + <% elif age_group == "all ages" %> + Cantril ladder score + <% endif %> + unit: "" + short_unit: "" + description_short: + <% if age_group == "all ages" %> + Average of survey responses to the 'Cantril Ladder' question in the Gallup World Poll. The survey question asks respondents to think of a ladder, with the best possible life for them being a 10, and the worst possible life being a 0. + <% endif %> + presentation: + attribution: + <% if age_group == "all ages" %> + World Happiness Report (2012-2024) + <% endif %> + title_public: + <% if age_group == "all ages" %> + Self-reported life satisfaction + <% elif age_group == "below 30" %> + Life satisfaction of people below 30 + <% elif age_group == "30-44" %> + Life satisfaction of people aged 30-44 + <% elif age_group == "45-59" %> + Life satisfaction of people aged 45-59 + <% elif age_group == "60 and above" %> + Life satisfaction of people aged 60 and above + <% endif %> + + dataset: @@ -13,22 +89,15 @@ dataset: tables: happiness: variables: - cantril_ladder_score: - title: Cantril ladder score - unit: "" - short_unit: "" - description_short: Average of survey responses to the 'Cantril Ladder' question in the Gallup World Poll. The survey question asks respondents to think of a ladder, with the best possible life for them being a 10, and the worst possible life being a 0. - description_processing: Average of regions is calculated by taking a population-weighted average over all countries within that region. - description_key: - - The Cantril ladder asks respondents to think of a ladder, with the best possible life for them being a 10 and the worst possible life being a 0. They are then asked to rate their own current lives on that 0 to 10 scale. - - The rankings are calculated by the source based on nationally representative samples for the three years prior to the year of the report, so that data for the 2024 report will draw from survey data from 2021-2023. We show the data for final year of the three-year survey period, i.e. we show the 2021-2023 survey data as 2023. - - The only exception is the data for the 2012 report, which uses survey data from 2005-2011, we show this data as the final year of the survey data - 2011. - - The number of people and countries surveyed varies year to year, but typically more than 100,000 people in 130 countries participate in the Gallup World Poll each year. - - The rankings are based entirely on the survey scores, using the Gallup weights to make the estimates representative. - - The data is the compilation of all previous World Happiness Reports, which can be found at https://worldhappiness.report/archive/. + happiness_score: + title: |- + {definitions.metric.title} + unit: |- + {definitions.metric.unit} + description_short: |- + {definitions.metric.description_short} presentation: - attribution: World Happiness Report (2012-2024) - attribution_short: WHR - title_public: Self-reported life satisfaction - display: - numDecimalPlaces: 2 \ No newline at end of file + attribution: |- + {definitions.metric.presentation.attribution} + title_public: |- + {definitions.metric.presentation.title_public} diff --git a/etl/steps/data/garden/happiness/2024-06-09/happiness.py b/etl/steps/data/garden/happiness/2024-06-09/happiness.py index 3edd2c6b15e..7ae578d6ea4 100644 --- a/etl/steps/data/garden/happiness/2024-06-09/happiness.py +++ b/etl/steps/data/garden/happiness/2024-06-09/happiness.py @@ -1,6 +1,6 @@ """Load a meadow dataset and create a garden dataset.""" - import owid.catalog.processing as pr +from owid.catalog import Dataset, Table from etl.data_helpers import geo from etl.helpers import PathFinder, create_dataset @@ -9,23 +9,39 @@ paths = PathFinder(__file__) -REGIONS = {reg: reg_dict for reg, reg_dict in geo.REGIONS.items() if reg != "European Union (27)"} -REGIONS.update({"World": {}}) +ALL_REGIONS = {reg: reg_dict for reg, reg_dict in geo.REGIONS.items() if reg != "European Union (27)"} +ALL_REGIONS.update({"World": {}}) + + +def remove_regions_below_population_threshold( + tb: Table, regions: dict, ds_population: Dataset, threshold: float +) -> Table: + """ + Check the share of population covered by the regions in the table. + """ + msk = tb["country"].isin(regions.keys()) + tb_region = tb[msk] + tb_no_regions = tb[~msk] + tb_region = geo.add_population_to_table(tb_region, ds_population, population_col="total_population") + tb_region["share_population"] = tb_region["population"] / tb_region["total_population"] + tb_region = tb_region[tb_region["share_population"] >= threshold] + tb_region = tb_region.drop(columns=["total_population", "share_population"]) + tb = pr.concat([tb_region, tb_no_regions]) + return tb def run(dest_dir: str) -> None: # # Load inputs. # - # Load meadow dataset, previous years and population data. + # Load datasets: meadow dataset (latest happiness report), previous years, happiness by ages + # for regional aggregates: population dataset, regions dataset, income groups dataset ds_meadow = paths.load_dataset("happiness", version="2024-06-09") ds_prev_years = paths.load_dataset("happiness", channel="garden", version="2023-03-20") - ds_population = paths.load_dataset("population", channel="garden") + ds_happiness_ages = paths.load_dataset("happiness_ages") - # Load regions dataset. + ds_population = paths.load_dataset("population", channel="garden") ds_regions = paths.load_dataset("regions") - - # Load income groups dataset. ds_income_groups = paths.load_dataset("income_groups") # Read table datasets. @@ -37,43 +53,94 @@ def run(dest_dir: str) -> None: cols_overlap = ["country", "cantril_ladder_score", "year"] tb = pr.concat([tb_this_year[cols_overlap], tb_prev_years], ignore_index=True) + # Read table including happiness by age group + tb_ages = ds_happiness_ages["happiness_ages"].reset_index() + # Harmonize country names tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + tb_ages = geo.harmonize_countries( + df=tb_ages, countries_file=paths.country_mapping_path, warn_on_unused_countries=False + ) + # Process happiness by age group data + # drop unneeded columns from age table + tb_ages = tb_ages.drop( + columns=[ + "region", + "age_group_code", + "stress_score", + "worry_score", + "happiness_count", + "stress_count", + "worry_count", + ] + ) + # remove leading "Age " from age_group + tb_ages["age_group"] = tb_ages["age_group"].str.replace("Age ", "") - # Process data (add population weighted averages for continents & income groups) + # standardize happiness by age group and happiness data + tb["age_group"] = "all ages" + tb["happiness_score"] = tb["cantril_ladder_score"] + tb = tb.drop(columns=["cantril_ladder_score"]) + # + # Add population weighted averages for continents & income groups) + # # save data of Northern Cyrpus and Somaliland to concat later (they do not have population in population dataset) countries_no_pop_msk = tb["country"].isin(["Northern Cyprus", "Somaliland"]) tb_countries_wo_population = tb[countries_no_pop_msk] tb = tb[~countries_no_pop_msk] - # add population to table + # add population to tables tb = geo.add_population_to_table(tb, ds_population) + tb_ages = geo.add_population_to_table(tb_ages, ds_population) # calculate population weighted averages by multiplying the population with the cantril ladder score # and then summing and dividing by the total population - tb["cantril_times_pop"] = tb["cantril_ladder_score"] * tb["population"] + tb["happiness_times_pop"] = tb["happiness_score"] * tb["population"] + tb_ages["happiness_times_pop"] = tb_ages["happiness_score"] * tb_ages["population"] + + # set population to NaN where happiness_score is NaN + tb_ages["population"] = tb_ages["population"].where(~tb_ages["happiness_score"].isna(), other=None) - aggr_score = {"cantril_times_pop": "sum", "population": "sum"} + aggr_score = {"happiness_times_pop": "sum", "population": "sum"} tb = geo.add_regions_to_table( tb, aggregations=aggr_score, - regions=REGIONS, + regions=ALL_REGIONS, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + index_columns=["country", "year", "age_group"], + min_num_values_per_year=1, + ) + + tb_ages = geo.add_regions_to_table( + tb_ages, + aggregations=aggr_score, + regions=ALL_REGIONS, ds_regions=ds_regions, ds_income_groups=ds_income_groups, + index_columns=["country", "year", "age_group"], min_num_values_per_year=1, ) + # For happiness by age group, remove all regions where less than 50% of the population is covered + tb_ages = remove_regions_below_population_threshold(tb_ages, ALL_REGIONS, ds_population, threshold=0.5) + # Divide the sum of the cantril ladder score times population by the total population - tb["cantril_ladder_score"] = tb["cantril_times_pop"] / tb["population"] + # concatenate the two tables + tb = pr.concat([tb, tb_ages], ignore_index=True) + tb["happiness_score"] = tb["happiness_times_pop"] / tb["population"] # drop unneeded columns - tb = tb.drop(columns=["cantril_times_pop"]) + tb = tb.drop(columns=["happiness_times_pop"]) # add back Northern Cyprus and Somaliland tb = pr.concat([tb, tb_countries_wo_population], ignore_index=True) - tb = tb.format(["country", "year"]) + # drop population + tb = tb.drop(columns=["population"]) + + tb = tb.format(["country", "year", "age_group"]) # Save outputs. # diff --git a/etl/steps/data/grapher/happiness/2024-06-09/happiness.py b/etl/steps/data/grapher/happiness/2024-06-09/happiness.py index e2afb7700ca..95d875888b0 100644 --- a/etl/steps/data/grapher/happiness/2024-06-09/happiness.py +++ b/etl/steps/data/grapher/happiness/2024-06-09/happiness.py @@ -1,5 +1,4 @@ """Load a garden dataset and create a grapher dataset.""" - from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. @@ -19,8 +18,9 @@ def run(dest_dir: str) -> None: # Save outputs. # # Create a new grapher dataset with the same metadata as the garden dataset. + # origins get added in grapher dataset, so do not warn about missing origins. ds_grapher = create_dataset( - dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + dest_dir, tables=[tb], check_variables_metadata=False, default_metadata=ds_garden.metadata ) # Save changes in the new grapher dataset. diff --git a/etl/steps/data/meadow/happiness/2024-06-20/happiness_ages.py b/etl/steps/data/meadow/happiness/2024-06-20/happiness_ages.py new file mode 100644 index 00000000000..43a2684c342 --- /dev/null +++ b/etl/steps/data/meadow/happiness/2024-06-20/happiness_ages.py @@ -0,0 +1,46 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +COLUMN_MAPPING = { + "Country name": "country", + "year": "year", + "Region indicator": "region", + "Age group code": "age_group_code", + "Age group": "age_group", + "Mean of ladder": "happiness_score", + "Mean of stress": "stress_score", + "Mean of worry": "worry_score", + "Count of ladder": "happiness_count", + "Count of stress": "stress_count", + "Count of worry": "worry_count", +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("happiness_ages.xls") + + # Load data from snapshot. + tb = snap.read() + + # rename columns + tb = tb.rename(columns=COLUMN_MAPPING, errors="raise") + + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year", "age_group"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/snapshots/happiness/2024-06-20/happiness_ages.py b/snapshots/happiness/2024-06-20/happiness_ages.py new file mode 100644 index 00000000000..0c25a52c2e2 --- /dev/null +++ b/snapshots/happiness/2024-06-20/happiness_ages.py @@ -0,0 +1,25 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"happiness/{SNAPSHOT_VERSION}/happiness_ages.xls") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/happiness/2024-06-20/happiness_ages.xls.dvc b/snapshots/happiness/2024-06-20/happiness_ages.xls.dvc new file mode 100644 index 00000000000..68d1e6bf9da --- /dev/null +++ b/snapshots/happiness/2024-06-20/happiness_ages.xls.dvc @@ -0,0 +1,34 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: World Happiness Report + description: |- + The World Happiness Report is a partnership of Gallup, the Oxford Wellbeing Research Centre, the UN Sustainable Development Solutions Network, and the WHR’s Editorial Board. + It reviews the state of happiness in the world today and shows how the science of happiness explains personal and national variations in happiness. + date_published: "2024-03-08" + version_producer: 2024 + title_snapshot: World Happiness Report - Happiness by age group + description_snapshot: |- + Happiness measured for different age groups for each country; Age groups are under 30 years old, 30-44 years old, 45-59 years old and 60 years old and above. + + # Citation + producer: Wellbeing Research Centre + citation_full: |- + Helliwell, J. F., Layard, R., Sachs, J. D., De Neve, J.-E., Aknin, L. B., & Wang, S. (Eds.). (2024). World Happiness Report 2024. University of Oxford: Wellbeing Research Centre. + attribution_short: WHR + + # Files + url_main: https://worldhappiness.report/ed/2024/ + date_accessed: 2024-06-20 + + # License + license: + name: "" + url: https://worldhappiness.report/ed/2024/ + +outs: + - md5: 512ec7ed6d2c698c1474c35adedc402e + size: 705536 + path: happiness_ages.xls