📊 happiness: by age group #2867

Draft · wants to merge 19 commits into master

Changes from 13 commits
4 changes: 4 additions & 0 deletions dag/main.yml
@@ -271,13 +271,17 @@ steps:

data://garden/happiness/2024-06-09/happiness:
- data://meadow/happiness/2024-06-09/happiness
- data://meadow/happiness/2024-06-20/happiness_ages
- data://garden/happiness/2023-03-20/happiness
- data://garden/demography/2023-03-31/population
- data://garden/regions/2023-01-01/regions
- data://garden/wb/2024-03-11/income_groups
data://grapher/happiness/2024-06-09/happiness:
- data://garden/happiness/2024-06-09/happiness

data://meadow/happiness/2024-06-20/happiness_ages:
- snapshot://happiness/2024-06-20/happiness_ages.xls


# LGBTI Policy Index (Velasco, 2020)
data://meadow/lgbt_rights/2023-04-27/lgbti_policy_index:

@@ -13,7 +13,7 @@ dataset:
tables:
happiness:
variables:
cantril_ladder_score:
happiness_score:
Contributor
You can use some Jinja templating here to add metadata for specific age groups, if you want. Using this, you can add index values into the metadata. Here is a slightly complicated example of how you can do that.

Contributor
I think this would help with the question you had about pivoting in the grapher step.
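
For context on the suggestion above, here is a minimal, generic sketch of what templating metadata per age group could look like, using the jinja2 package directly; the ETL's own metadata templating mechanism is the one shown in the linked example and may differ in syntax, so treat this only as an illustration of the idea:

from jinja2 import Template

# Hypothetical title template, filled in once per age-group value taken from the index.
title_template = Template("Life satisfaction of people {{ age_group }}")

for age_group in ["below 30", "aged 30-44", "aged 45-59", "aged 60 and above", "of all ages"]:
    print(title_template.render(age_group=age_group))
# -> "Life satisfaction of people below 30", "Life satisfaction of people aged 30-44", ...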

title: Cantril ladder score
unit: ""
short_unit: ""
81 changes: 66 additions & 15 deletions etl/steps/data/garden/happiness/2024-06-09/happiness.py
@@ -9,23 +9,22 @@
paths = PathFinder(__file__)


REGIONS = {reg: reg_dict for reg, reg_dict in geo.REGIONS.items() if reg != "European Union (27)"}
REGIONS.update({"World": {}})
ALL_REGIONS = {reg: reg_dict for reg, reg_dict in geo.REGIONS.items() if reg != "European Union (27)"}
ALL_REGIONS.update({"World": {}})


def run(dest_dir: str) -> None:
#
# Load inputs.
#
# Load meadow dataset, previous years and population data.
# Load datasets: the meadow dataset (latest happiness report), previous years, and happiness by age group;
# for regional aggregates: the population, regions and income groups datasets
ds_meadow = paths.load_dataset("happiness", version="2024-06-09")
ds_prev_years = paths.load_dataset("happiness", channel="garden", version="2023-03-20")
ds_population = paths.load_dataset("population", channel="garden")
ds_happiness_ages = paths.load_dataset("happiness_ages")

# Load regions dataset.
ds_population = paths.load_dataset("population", channel="garden")
ds_regions = paths.load_dataset("regions")

# Load income groups dataset.
ds_income_groups = paths.load_dataset("income_groups")

# Read table datasets.
@@ -37,43 +36,95 @@ def run(dest_dir: str) -> None:
cols_overlap = ["country", "cantril_ladder_score", "year"]
tb = pr.concat([tb_this_year[cols_overlap], tb_prev_years], ignore_index=True)

# Read table including happiness by age group
tb_ages = ds_happiness_ages["happiness_ages"].reset_index()

# Harmonize country names
tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path)
tb_ages = geo.harmonize_countries(
df=tb_ages, countries_file=paths.country_mapping_path, warn_on_unused_countries=False
)
# Process happiness by age group data
# drop unneeded columns from age table
tb_ages = tb_ages.drop(
columns=[
"region",
"age_group_code",
"stress_score",
"worry_score",
"happiness_count",
"stress_count",
"worry_count",
]
)
# remove leading "Age " from age_group
tb_ages["age_group"] = tb_ages["age_group"].str.replace("Age ", "")

# Process data (add population-weighted averages for continents & income groups)
# standardize the happiness-by-age-group table and the main happiness table
tb["age_group"] = "all ages"
tb["happiness_score"] = tb["cantril_ladder_score"]
tb = tb.drop(columns=["cantril_ladder_score"])

#
# Add population-weighted averages for continents & income groups
#
# save data for Northern Cyprus and Somaliland to concat later (they are not included in the population dataset)
countries_no_pop_msk = tb["country"].isin(["Northern Cyprus", "Somaliland"])
tb_countries_wo_population = tb[countries_no_pop_msk]
tb = tb[~countries_no_pop_msk]

# add population to table
# add population to tables
tb = geo.add_population_to_table(tb, ds_population)
tb_ages = geo.add_population_to_table(tb_ages, ds_population)

# calculate population-weighted averages by multiplying the happiness score by the population,
# then summing and dividing by the total population
tb["cantril_times_pop"] = tb["cantril_ladder_score"] * tb["population"]
tb["happiness_times_pop"] = tb["happiness_score"] * tb["population"]
tb_ages["happiness_times_pop"] = tb_ages["happiness_score"] * tb_ages["population"]

aggr_score = {"cantril_times_pop": "sum", "population": "sum"}
# set population to NaN where happiness_score is NaN
tb["population"] = tb["population"].where(~tb["happiness_score"].isna(), other=None)
tb_ages["population"] = tb_ages["population"].where(~tb_ages["happiness_score"].isna(), other=None)

aggr_score = {"happiness_times_pop": "sum", "population": "sum"}
tb = geo.add_regions_to_table(
tb,
aggregations=aggr_score,
regions=REGIONS,
regions=ALL_REGIONS,
ds_regions=ds_regions,
ds_income_groups=ds_income_groups,
index_columns=["country", "year", "age_group"],
min_num_values_per_year=1,
)

# For happiness by age group, remove all regions where less than 50% of the population is covered
# Manual check: Africa and the low-income countries group are not sufficiently covered
regions_for_age_groups = {
reg: reg_dict for reg, reg_dict in ALL_REGIONS.items() if reg not in ["Africa", "Low-income countries"]
}

tb_ages = geo.add_regions_to_table(
tb_ages,
aggregations=aggr_score,
regions=regions_for_age_groups,
ds_regions=ds_regions,
ds_income_groups=ds_income_groups,
index_columns=["country", "year", "age_group"],
min_num_values_per_year=1,
)

# Divide the sum of the cantril ladder score times population by the total population
tb["cantril_ladder_score"] = tb["cantril_times_pop"] / tb["population"]
# concatenate the two tables
tb = pr.concat([tb, tb_ages], ignore_index=True)
tb["happiness_score"] = tb["happiness_times_pop"] / tb["population"]

# drop unneeded columns
tb = tb.drop(columns=["cantril_times_pop"])
tb = tb.drop(columns=["happiness_times_pop"])

# add back Northern Cyprus and Somaliland
tb = pr.concat([tb, tb_countries_wo_population], ignore_index=True)

tb = tb.format(["country", "year"])
tb = tb.format(["country", "year", "age_group"])

# Save outputs.
#
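
As an aside, the population-weighted aggregation performed in this garden step reduces to a sum-and-divide. Here is a minimal sketch of the arithmetic on toy data using plain pandas; the step itself works on owid.catalog Tables through geo.add_regions_to_table, so this is only an illustration of the calculation, not the step's actual code:

import pandas as pd

# Toy region with three countries, one of which has no happiness score.
df = pd.DataFrame(
    {
        "country": ["A", "B", "C"],
        "happiness_score": [6.0, 4.0, None],
        "population": [10_000_000, 30_000_000, 5_000_000],
    }
)

# Multiply scores by population, and blank out the population of countries without
# a score so that they do not dilute the regional average.
df["happiness_times_pop"] = df["happiness_score"] * df["population"]
df["population"] = df["population"].where(df["happiness_score"].notna())

totals = df[["happiness_times_pop", "population"]].sum()  # NaNs are skipped
print(totals["happiness_times_pop"] / totals["population"])  # 4.5 = (6*10m + 4*30m) / 40m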
76 changes: 76 additions & 0 deletions etl/steps/data/grapher/happiness/2024-06-09/happiness.meta.yml
@@ -0,0 +1,76 @@
# NOTE: To learn more about the fields, hover over their names.
definitions:
common:
presentation:
topic_tags:
- Happiness & Life Satisfaction
attribution_short: WHR
display:
numDecimalPlaces: 2
processing_level: major
unit: ""
short_unit: ""
origins:
# Data product / Snapshot
- title: World Happiness Report
description: |-
The World Happiness Report is a partnership of Gallup, the Oxford Wellbeing Research Centre, the UN Sustainable Development Solutions Network, and the WHR’s Editorial Board.
It reviews the state of happiness in the world today and shows how the science of happiness explains personal and national variations in happiness.
date_published: "2024-03-08"
version_producer: 2024
# Citation
producer: Wellbeing Research Centre
citation_full: |-
Helliwell, J. F., Layard, R., Sachs, J. D., De Neve, J.-E., Aknin, L. B., & Wang, S. (Eds.). (2024). World Happiness Report 2024. University of Oxford: Wellbeing Research Centre.
attribution_short: WHR
# Files
url_main: https://worldhappiness.report/ed/2024/
date_accessed: 2024-06-20
# License
license:
name: ""
url: https://worldhappiness.report/ed/2024/
description_short:
"Self-reported life satisfaction is the answer to the question:
'Imagine a ladder with the best possible life being a 10, and the worst possible life being a 0.
Which step on that ladder would you say you stand on right now?'"
description_processing:
Regional averages are calculated by taking a population-weighted average over all countries within each region.
Since data per age group is not available for all countries, regional aggregates for individual age groups may be based on fewer countries and can differ from the all-ages aggregate.
description_key:
- The Cantril ladder asks respondents to think of a ladder, with the best possible life for them being a 10 and the worst possible life being a 0. They are then asked to rate their own current lives on that 0 to 10 scale.
- The rankings are calculated by the source based on nationally representative samples for the three years prior to the year of the report, so that data for the 2024 report draws on survey data from 2021-2023. We show the data for the final year of the three-year survey period, i.e. we show the 2021-2023 survey data as 2023.
- The only exception is the data for the 2012 report, which uses survey data from 2005-2011; we show this data as 2011, the final year of that survey period.
- The number of people and countries surveyed varies year to year, but typically more than 100,000 people in 130 countries participate in the Gallup World Poll each year.
- The rankings are based entirely on the survey scores, using the Gallup weights to make the estimates representative.
- The data is the compilation of all previous World Happiness Reports, which can be found at https://worldhappiness.report/archive/.




# Learn more about the available fields:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
dataset:
update_period_days: 365


tables:
happiness:
variables:
happiness_below_30:
title: Life satisfaction of people below 30
happiness_30_to_44:
title: Life satisfaction of people aged 30-44
happiness_45_to_59:
title: Life satisfaction of people aged 45-59
happiness_60_and_above:
title: Life satisfaction of people aged 60 and above
happiness_all_ages:
title: Cantril ladder score
description_short: Average of survey responses to the 'Cantril Ladder' question in the Gallup World Poll. The survey question asks respondents to think of a ladder, with the best possible life for them being a 10, and the worst possible life being a 0.
presentation:
attribution: World Happiness Report (2012-2024)
attribution_short: WHR
title_public: Self-reported life satisfaction
display:
numDecimalPlaces: 2
56 changes: 54 additions & 2 deletions etl/steps/data/grapher/happiness/2024-06-09/happiness.py
@@ -1,10 +1,51 @@
"""Load a garden dataset and create a grapher dataset."""
import pandas as pd
from owid.catalog import Table

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)

AGES_COLUMNS = [
"happiness_below_30",
"happiness_30_to_44",
"happiness_45_to_59",
"happiness_60_and_above",
"happiness_all_ages",
]


def pivot_age_groups(tb: Table) -> Table:
"""Pivot the table from one row per (country, year, age group) to one column per age group."""
new_tb_rows = []
for cty in tb["country"].unique():
cty_tb = tb[tb["country"] == cty]
for year in cty_tb["year"].unique():
new_row_dict = {"country": cty, "year": year}
row_tb = cty_tb[cty_tb["year"] == year]
ages = ["below 30", "30-44", "45-59", "60 and above", "all ages"]
for idx in range(len(ages)):
age_entry = ages[idx]
age_column = AGES_COLUMNS[idx]
age_row = row_tb[row_tb["age_group"] == age_entry]
if len(age_row) == 0:
new_row_dict[age_column] = pd.NA
else:
new_row_dict[age_column] = age_row["happiness_score"].values[0]
new_tb_rows.append(new_row_dict)
tb_pivot = Table(
pd.DataFrame(
new_tb_rows,
columns=[
"country",
"year",
]
+ AGES_COLUMNS,
)
)
tb_pivot = tb_pivot.copy_metadata(tb)
return tb_pivot


def run(dest_dir: str) -> None:
#
@@ -14,13 +55,24 @@ def run(dest_dir: str) -> None:
ds_garden = paths.load_dataset("happiness")

# Read table from garden dataset.
tb = ds_garden["happiness"]
tb = ds_garden["happiness"].reset_index()

# drop the population column and pivot the table so that each age group becomes its own column
tb = tb.drop(columns=["population"])

tb = pivot_age_groups(tb)

for age_col in AGES_COLUMNS:
tb[age_col] = tb[age_col].astype("Float64")

tb = tb.format(["country", "year"])

# Save outputs.
#
# Create a new grapher dataset with the same metadata as the garden dataset.
# origins get added in grapher dataset, so do not warn about missing origins.
ds_grapher = create_dataset(
dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata
dest_dir, tables=[tb], check_variables_metadata=False, default_metadata=ds_garden.metadata
)

# Save changes in the new grapher dataset.
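
On the pivoting question raised in the review comment above: the same reshaping could in principle be done with pandas' built-in pivot. A rough sketch on a plain DataFrame follows; this is a hypothetical alternative, not part of the PR, and owid.catalog Table metadata would still need to be carried over separately, which is presumably why the explicit loop plus copy_metadata is used here:

import pandas as pd

AGE_GROUP_TO_COLUMN = {
    "below 30": "happiness_below_30",
    "30-44": "happiness_30_to_44",
    "45-59": "happiness_45_to_59",
    "60 and above": "happiness_60_and_above",
    "all ages": "happiness_all_ages",
}


def pivot_age_groups_with_pandas(df: pd.DataFrame) -> pd.DataFrame:
    # One row per (country, year), one column per age group.
    wide = df.pivot(index=["country", "year"], columns="age_group", values="happiness_score")
    wide = wide.rename(columns=AGE_GROUP_TO_COLUMN).reset_index()
    wide.columns.name = None  # drop the residual "age_group" columns label
    return wide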
46 changes: 46 additions & 0 deletions etl/steps/data/meadow/happiness/2024-06-20/happiness_ages.py
@@ -0,0 +1,46 @@
"""Load a snapshot and create a meadow dataset."""

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)

COLUMN_MAPPING = {
"Country name": "country",
"year": "year",
"Region indicator": "region",
"Age group code": "age_group_code",
"Age group": "age_group",
"Mean of ladder": "happiness_score",
"Mean of stress": "stress_score",
"Mean of worry": "worry_score",
"Count of ladder": "happiness_count",
"Count of stress": "stress_count",
"Count of worry": "worry_count",
}


def run(dest_dir: str) -> None:
#
# Load inputs.
#
# Retrieve snapshot.
snap = paths.load_snapshot("happiness_ages.xls")

# Load data from snapshot.
tb = snap.read()

# rename columns
tb = tb.rename(columns=COLUMN_MAPPING, errors="raise")

# Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
tb = tb.format(["country", "year", "age_group"])

#
# Save outputs.
#
# Create a new meadow dataset with the same metadata as the snapshot.
ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)

# Save changes in the new meadow dataset.
ds_meadow.save()
25 changes: 25 additions & 0 deletions snapshots/happiness/2024-06-20/happiness_ages.py
@@ -0,0 +1,25 @@
"""Script to create a snapshot of dataset."""

from pathlib import Path

import click

from etl.snapshot import Snapshot

# Version for current snapshot dataset.
SNAPSHOT_VERSION = Path(__file__).parent.name


@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.")
def main(path_to_file: str, upload: bool) -> None:
# Create a new snapshot.
snap = Snapshot(f"happiness/{SNAPSHOT_VERSION}/happiness_ages.xls")

# Copy local data file to snapshots data folder, add file to DVC and upload to S3.
snap.create_snapshot(filename=path_to_file, upload=upload)


if __name__ == "__main__":
main()
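
For reference, given the click options above, this snapshot script is run with --path-to-file pointing at the locally downloaded WHR happiness-by-age .xls file, and optionally --skip-upload to avoid uploading to S3; the exact source location of that file is not documented in this PR.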