📊 happiness: by age group #2867

Draft · wants to merge 19 commits into master

Changes from 13 commits
4 changes: 4 additions & 0 deletions dag/main.yml
@@ -271,13 +271,17 @@ steps:

data://garden/happiness/2024-06-09/happiness:
- data://meadow/happiness/2024-06-09/happiness
- data://meadow/happiness/2024-06-20/happiness_ages
- data://garden/happiness/2023-03-20/happiness
- data://garden/demography/2023-03-31/population
- data://garden/regions/2023-01-01/regions
- data://garden/wb/2024-03-11/income_groups
data://grapher/happiness/2024-06-09/happiness:
- data://garden/happiness/2024-06-09/happiness

data://meadow/happiness/2024-06-20/happiness_ages:
- snapshot://happiness/2024-06-20/happiness_ages.xls


# LGBTI Policy Index (Velasco, 2020)
data://meadow/lgbt_rights/2023-04-27/lgbti_policy_index:

@@ -13,7 +13,7 @@ dataset:
tables:
happiness:
variables:
cantril_ladder_score:
happiness_score:
Contributor
You can use some Jinja templating here to add metadata for specific age groups, if you want. Using this, you can add index values into the metadata. Here is a slightly complicated example of how you can do that.

Contributor
I think this would help with the question you had about pivoting in the grapher step.
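
For context on the suggestion above, here is a minimal, generic sketch of what templating metadata per age group could look like, using the jinja2 package directly; the ETL's own metadata templating mechanism is the one shown in the linked example and may differ in syntax, so treat this only as an illustration of the idea:

from jinja2 import Template

# Hypothetical title template, filled in once per age-group value taken from the index.
title_template = Template("Life satisfaction of people {{ age_group }}")

for age_group in ["below 30", "aged 30-44", "aged 45-59", "aged 60 and above", "of all ages"]:
    print(title_template.render(age_group=age_group))
# -> "Life satisfaction of people below 30", "Life satisfaction of people aged 30-44", ...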

title: Cantril ladder score
unit: ""
short_unit: ""
81 changes: 66 additions & 15 deletions etl/steps/data/garden/happiness/2024-06-09/happiness.py
@@ -9,23 +9,22 @@
paths = PathFinder(__file__)


REGIONS = {reg: reg_dict for reg, reg_dict in geo.REGIONS.items() if reg != "European Union (27)"}
REGIONS.update({"World": {}})
ALL_REGIONS = {reg: reg_dict for reg, reg_dict in geo.REGIONS.items() if reg != "European Union (27)"}
ALL_REGIONS.update({"World": {}})


def run(dest_dir: str) -> None:
#
# Load inputs.
#
# Load meadow dataset, previous years and population data.
# Load datasets: the meadow dataset (latest happiness report), previous years, and happiness by age group;
# for regional aggregates: the population, regions and income groups datasets
ds_meadow = paths.load_dataset("happiness", version="2024-06-09")
ds_prev_years = paths.load_dataset("happiness", channel="garden", version="2023-03-20")
ds_population = paths.load_dataset("population", channel="garden")
ds_happiness_ages = paths.load_dataset("happiness_ages")

# Load regions dataset.
ds_population = paths.load_dataset("population", channel="garden")
ds_regions = paths.load_dataset("regions")

# Load income groups dataset.
ds_income_groups = paths.load_dataset("income_groups")

# Read table datasets.
@@ -37,43 +36,95 @@ def run(dest_dir: str) -> None:
cols_overlap = ["country", "cantril_ladder_score", "year"]
tb = pr.concat([tb_this_year[cols_overlap], tb_prev_years], ignore_index=True)

# Read table including happiness by age group
tb_ages = ds_happiness_ages["happiness_ages"].reset_index()

# Harmonize country names
tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path)
tb_ages = geo.harmonize_countries(
df=tb_ages, countries_file=paths.country_mapping_path, warn_on_unused_countries=False
)
# Process happiness by age group data
# drop unneeded columns from age table
tb_ages = tb_ages.drop(
columns=[
"region",
"age_group_code",
"stress_score",
"worry_score",
"happiness_count",
"stress_count",
"worry_count",
]
)
# remove leading "Age " from age_group
tb_ages["age_group"] = tb_ages["age_group"].str.replace("Age ", "")

# Process data (add population-weighted averages for continents & income groups)
# standardize the happiness-by-age-group table and the main happiness table
tb["age_group"] = "all ages"
tb["happiness_score"] = tb["cantril_ladder_score"]
tb = tb.drop(columns=["cantril_ladder_score"])

#
# Add population-weighted averages for continents & income groups
#
# save data for Northern Cyprus and Somaliland to concat later (they are not included in the population dataset)
countries_no_pop_msk = tb["country"].isin(["Northern Cyprus", "Somaliland"])
tb_countries_wo_population = tb[countries_no_pop_msk]
tb = tb[~countries_no_pop_msk]

# add population to table
# add population to tables
tb = geo.add_population_to_table(tb, ds_population)
tb_ages = geo.add_population_to_table(tb_ages, ds_population)

# calculate population-weighted averages by multiplying the happiness score by the population,
# then summing and dividing by the total population
tb["cantril_times_pop"] = tb["cantril_ladder_score"] * tb["population"]
tb["happiness_times_pop"] = tb["happiness_score"] * tb["population"]
tb_ages["happiness_times_pop"] = tb_ages["happiness_score"] * tb_ages["population"]

aggr_score = {"cantril_times_pop": "sum", "population": "sum"}
# set population to NaN where happiness_score is NaN
tb["population"] = tb["population"].where(~tb["happiness_score"].isna(), other=None)
tb_ages["population"] = tb_ages["population"].where(~tb_ages["happiness_score"].isna(), other=None)

aggr_score = {"happiness_times_pop": "sum", "population": "sum"}
tb = geo.add_regions_to_table(
tb,
aggregations=aggr_score,
regions=REGIONS,
regions=ALL_REGIONS,
ds_regions=ds_regions,
ds_income_groups=ds_income_groups,
index_columns=["country", "year", "age_group"],
min_num_values_per_year=1,
)

# For happiness by age group, remove all regions where less than 50% of the population is covered
# Manual check: Africa and the low-income countries group are not sufficiently covered
regions_for_age_groups = {
reg: reg_dict for reg, reg_dict in ALL_REGIONS.items() if reg not in ["Africa", "Low-income countries"]
}

tb_ages = geo.add_regions_to_table(
tb_ages,
aggregations=aggr_score,
regions=regions_for_age_groups,
ds_regions=ds_regions,
ds_income_groups=ds_income_groups,
index_columns=["country", "year", "age_group"],
min_num_values_per_year=1,
)

# Divide the sum of the cantril ladder score times population by the total population
tb["cantril_ladder_score"] = tb["cantril_times_pop"] / tb["population"]
# concatenate the two tables
tb = pr.concat([tb, tb_ages], ignore_index=True)
tb["happiness_score"] = tb["happiness_times_pop"] / tb["population"]

# drop unneeded columns
tb = tb.drop(columns=["cantril_times_pop"])
tb = tb.drop(columns=["happiness_times_pop"])

# add back Northern Cyprus and Somaliland
tb = pr.concat([tb, tb_countries_wo_population], ignore_index=True)

tb = tb.format(["country", "year"])
tb = tb.format(["country", "year", "age_group"])

# Save outputs.
#
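
As an aside, the population-weighted aggregation performed in this garden step reduces to a sum-and-divide. Here is a minimal sketch of the arithmetic on toy data using plain pandas; the step itself works on owid.catalog Tables through geo.add_regions_to_table, so this is only an illustration of the calculation, not the step's actual code:

import pandas as pd

# Toy region with three countries, one of which has no happiness score.
df = pd.DataFrame(
    {
        "country": ["A", "B", "C"],
        "happiness_score": [6.0, 4.0, None],
        "population": [10_000_000, 30_000_000, 5_000_000],
    }
)

# Multiply scores by population, and blank out the population of countries without
# a score so that they do not dilute the regional average.
df["happiness_times_pop"] = df["happiness_score"] * df["population"]
df["population"] = df["population"].where(df["happiness_score"].notna())

totals = df[["happiness_times_pop", "population"]].sum()  # NaNs are skipped
print(totals["happiness_times_pop"] / totals["population"])  # 4.5 = (6*10m + 4*30m) / 40m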
76 changes: 76 additions & 0 deletions etl/steps/data/grapher/happiness/2024-06-09/happiness.meta.yml
@@ -0,0 +1,76 @@
# NOTE: To learn more about the fields, hover over their names.
definitions:
common:
presentation:
topic_tags:
- Happiness & Life Satisfaction
attribution_short: WHR
display:
numDecimalPlaces: 2
processing_level: major
unit: ""
short_unit: ""
origins:
# Data product / Snapshot
- title: World Happiness Report
description: |-
The World Happiness Report is a partnership of Gallup, the Oxford Wellbeing Research Centre, the UN Sustainable Development Solutions Network, and the WHR’s Editorial Board.
It reviews the state of happiness in the world today and shows how the science of happiness explains personal and national variations in happiness.
date_published: "2024-03-08"
version_producer: 2024
# Citation
producer: Wellbeing Research Centre
citation_full: |-
Helliwell, J. F., Layard, R., Sachs, J. D., De Neve, J.-E., Aknin, L. B., & Wang, S. (Eds.). (2024). World Happiness Report 2024. University of Oxford: Wellbeing Research Centre.
attribution_short: WHR
# Files
url_main: https://worldhappiness.report/ed/2024/
date_accessed: 2024-06-20
# License
license:
name: ""
url: https://worldhappiness.report/ed/2024/
description_short:
"Self-reported life satisfaction is the answer to the question:
'Imagine a ladder with the best possible life being a 10, and the worst possible life being a 0.
Which step on that ladder would you say you stand on right now?'"
description_processing:
Regional averages are calculated by taking a population-weighted average over all countries within each region.
Since data per age group is not available for all countries, regional aggregates for individual age groups may be based on fewer countries and can differ from the all-ages aggregate.
description_key:
- The Cantril ladder asks respondents to think of a ladder, with the best possible life for them being a 10 and the worst possible life being a 0. They are then asked to rate their own current lives on that 0 to 10 scale.
- The rankings are calculated by the source based on nationally representative samples for the three years prior to the year of the report, so that data for the 2024 report draws on survey data from 2021-2023. We show the data for the final year of the three-year survey period, i.e. we show the 2021-2023 survey data as 2023.
- The only exception is the data for the 2012 report, which uses survey data from 2005-2011; we show this data as 2011, the final year of that survey period.
- The number of people and countries surveyed varies year to year, but typically more than 100,000 people in 130 countries participate in the Gallup World Poll each year.
- The rankings are based entirely on the survey scores, using the Gallup weights to make the estimates representative.
- The data is the compilation of all previous World Happiness Reports, which can be found at https://worldhappiness.report/archive/.




# Learn more about the available fields:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
dataset:
update_period_days: 365


tables:
happiness:
variables:
happiness_below_30:
title: Life satisfaction of people below 30
happiness_30_to_44:
title: Life satisfaction of people aged 30-44
happiness_45_to_59:
title: Life satisfaction of people aged 45-59
happiness_60_and_above:
title: Life satisfaction of people aged 60 and above
happiness_all_ages:
title: Cantril ladder score
description_short: Average of survey responses to the 'Cantril Ladder' question in the Gallup World Poll. The survey question asks respondents to think of a ladder, with the best possible life for them being a 10, and the worst possible life being a 0.
presentation:
attribution: World Happiness Report (2012-2024)
attribution_short: WHR
title_public: Self-reported life satisfaction
display:
numDecimalPlaces: 2
56 changes: 54 additions & 2 deletions etl/steps/data/grapher/happiness/2024-06-09/happiness.py
@@ -1,10 +1,51 @@
"""Load a garden dataset and create a grapher dataset."""
import pandas as pd
from owid.catalog import Table

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)

AGES_COLUMNS = [
"happiness_below_30",
"happiness_30_to_44",
"happiness_45_to_59",
"happiness_60_and_above",
"happiness_all_ages",
]


def pivot_age_groups(tb: Table) -> Table:
"""Pivot the table from one row per (country, year, age group) to one column per age group."""
new_tb_rows = []
for cty in tb["country"].unique():
cty_tb = tb[tb["country"] == cty]
for year in cty_tb["year"].unique():
new_row_dict = {"country": cty, "year": year}
row_tb = cty_tb[cty_tb["year"] == year]
ages = ["below 30", "30-44", "45-59", "60 and above", "all ages"]
for idx in range(len(ages)):
age_entry = ages[idx]
age_column = AGES_COLUMNS[idx]
age_row = row_tb[row_tb["age_group"] == age_entry]
if len(age_row) == 0:
new_row_dict[age_column] = pd.NA
else:
new_row_dict[age_column] = age_row["happiness_score"].values[0]
new_tb_rows.append(new_row_dict)
tb_pivot = Table(
pd.DataFrame(
new_tb_rows,
columns=[
"country",
"year",
]
+ AGES_COLUMNS,
)
)
tb_pivot = tb_pivot.copy_metadata(tb)
return tb_pivot


def run(dest_dir: str) -> None:
#
@@ -14,13 +55,24 @@ def run(dest_dir: str) -> None:
ds_garden = paths.load_dataset("happiness")

# Read table from garden dataset.
tb = ds_garden["happiness"]
tb = ds_garden["happiness"].reset_index()

# drop the population column and pivot the table so that each age group becomes its own column
tb = tb.drop(columns=["population"])

tb = pivot_age_groups(tb)

for age_col in AGES_COLUMNS:
tb[age_col] = tb[age_col].astype("Float64")

tb = tb.format(["country", "year"])

# Save outputs.
#
# Create a new grapher dataset with the same metadata as the garden dataset.
# origins get added in grapher dataset, so do not warn about missing origins.
ds_grapher = create_dataset(
dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata
dest_dir, tables=[tb], check_variables_metadata=False, default_metadata=ds_garden.metadata
)

# Save changes in the new grapher dataset.
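
On the pivoting question raised in the review comment above: the same reshaping could in principle be done with pandas' built-in pivot. A rough sketch on a plain DataFrame follows; this is a hypothetical alternative, not part of the PR, and owid.catalog Table metadata would still need to be carried over separately, which is presumably why the explicit loop plus copy_metadata is used here:

import pandas as pd

AGE_GROUP_TO_COLUMN = {
    "below 30": "happiness_below_30",
    "30-44": "happiness_30_to_44",
    "45-59": "happiness_45_to_59",
    "60 and above": "happiness_60_and_above",
    "all ages": "happiness_all_ages",
}


def pivot_age_groups_with_pandas(df: pd.DataFrame) -> pd.DataFrame:
    # One row per (country, year), one column per age group.
    wide = df.pivot(index=["country", "year"], columns="age_group", values="happiness_score")
    wide = wide.rename(columns=AGE_GROUP_TO_COLUMN).reset_index()
    wide.columns.name = None  # drop the residual "age_group" columns label
    return wide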
46 changes: 46 additions & 0 deletions etl/steps/data/meadow/happiness/2024-06-20/happiness_ages.py
@@ -0,0 +1,46 @@
"""Load a snapshot and create a meadow dataset."""

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)

COLUMN_MAPPING = {
"Country name": "country",
"year": "year",
"Region indicator": "region",
"Age group code": "age_group_code",
"Age group": "age_group",
"Mean of ladder": "happiness_score",
"Mean of stress": "stress_score",
"Mean of worry": "worry_score",
"Count of ladder": "happiness_count",
"Count of stress": "stress_count",
"Count of worry": "worry_count",
}


def run(dest_dir: str) -> None:
#
# Load inputs.
#
# Retrieve snapshot.
snap = paths.load_snapshot("happiness_ages.xls")

# Load data from snapshot.
tb = snap.read()

# rename columns
tb = tb.rename(columns=COLUMN_MAPPING, errors="raise")

# Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
tb = tb.format(["country", "year", "age_group"])

#
# Save outputs.
#
# Create a new meadow dataset with the same metadata as the snapshot.
ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)

# Save changes in the new meadow dataset.
ds_meadow.save()
25 changes: 25 additions & 0 deletions snapshots/happiness/2024-06-20/happiness_ages.py
@@ -0,0 +1,25 @@
"""Script to create a snapshot of dataset."""

from pathlib import Path

import click

from etl.snapshot import Snapshot

# Version for current snapshot dataset.
SNAPSHOT_VERSION = Path(__file__).parent.name


@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.")
def main(path_to_file: str, upload: bool) -> None:
# Create a new snapshot.
snap = Snapshot(f"happiness/{SNAPSHOT_VERSION}/happiness_ages.xls")

# Copy local data file to snapshots data folder, add file to DVC and upload to S3.
snap.create_snapshot(filename=path_to_file, upload=upload)


if __name__ == "__main__":
main()
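
For reference, given the click options above, this snapshot script is run with --path-to-file pointing at the locally downloaded WHR happiness-by-age .xls file, and optionally --skip-upload to avoid uploading to S3; the exact source location of that file is not documented in this PR.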