Skip to content

Commit

Permalink
Add YouGov data on dietary choices of Brits (#2480)
Browse files Browse the repository at this point in the history
* Add snapshot, meadow, garden and grapher steps for YouGov data on dietary choices of Brits

* Improve metadata and adapt dates for grapher

* Set number of decimals
  • Loading branch information
pabloarosado authored Apr 1, 2024
1 parent c7a88ee commit 0dca90d
Show file tree
Hide file tree
Showing 8 changed files with 327 additions and 0 deletions.
1 change: 1 addition & 0 deletions dag/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -751,3 +751,4 @@ include:
- dag/poverty_inequality.yml
- dag/democracy.yml
- dag/temp.yml
- dag/survey.yml
14 changes: 14 additions & 0 deletions dag/survey.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
steps:
#
# YouGov - Dietary choices of Brits.
#
data://meadow/survey/2024-04-01/dietary_choices_uk:
- snapshot://survey/2024-04-01/dietary_choices_uk.xlsx
data://garden/survey/2024-04-01/dietary_choices_uk:
- data://meadow/survey/2024-04-01/dietary_choices_uk
data://grapher/survey/2024-04-01/dietary_choices_uk:
- data://garden/survey/2024-04-01/dietary_choices_uk

######################################################################################################################
# Older versions to be archived once they are not used by any other steps.
######################################################################################################################
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
definitions:
common:
processing_level: minor
presentation:
attribution_short: YouGov
topic_tags:
- Diet Compositions
- Animal Welfare
- Food Supply
display: &common-display
numDecimalPlaces: 0

dataset:
update_period_days: 365

tables:
dietary_choices_uk:
variables:
base:
title: Weighted number of responses
description_short: |-
Number of responses, after applying weights, of a particular group (e.g. "18-24" or "Female") and date.
unit: ""
short_unit: ""
base_unweighted:
  # Short title plus a separate description, mirroring the sibling "base" variable.
  title: Unweighted number of responses
  description_short: |-
    Number of responses, before applying weights, of a particular group (e.g. "18-24" or "Female") and date.
  unit: ""
  short_unit: ""
meat_eater:
title: Percentage of meat eaters
description_short: |-
Percentage of participants who responded to the question "Which of these best describes your diet?" with "Meat eater: eat meat and/or poultry".
unit: "%"
short_unit: "%"
display:
name: Meat eater
<<: *common-display
presentation:
title_public: Percentage of meat eaters
flexitarian:
title: Percentage of flexitarians
description_short: |-
Percentage of participants who responded to the question "Which of these best describes your diet?" with "Flexitarian: mainly vegetarian, but occasionally eat meat or fish".
unit: "%"
short_unit: "%"
display:
name: Flexitarian
<<: *common-display
presentation:
title_public: Percentage of flexitarians
pescetarian:
title: Percentage of pescetarians
description_short: |-
Percentage of participants who responded to the question "Which of these best describes your diet?" with "Pescetarian: eat fish but do not eat meat or poultry".
unit: "%"
short_unit: "%"
display:
name: Pescetarian
<<: *common-display
presentation:
title_public: Percentage of pescetarians
vegetarian:
title: Percentage of vegetarians
description_short: |-
Percentage of participants who responded to the question "Which of these best describes your diet?" with "Vegetarian: do not eat any meat, poultry, game, fish, or shellfish".
unit: "%"
short_unit: "%"
display:
name: Vegetarian
<<: *common-display
presentation:
title_public: Percentage of vegetarians
vegan:
title: Percentage of vegans
description_short: |-
Percentage of participants who responded to the question "Which of these best describes your diet?" with "Plant-based / Vegan: do not eat dairy products, eggs, or any other animal product".
unit: "%"
short_unit: "%"
display:
name: Vegan
<<: *common-display
presentation:
title_public: Percentage of vegans
none:
title: Percentage of people with other diets
description_short: |-
Percentage of participants who responded to the question "Which of these best describes your diet?" with "None of these".
unit: "%"
short_unit: "%"
display:
name: None of these
<<: *common-display
presentation:
title_public: Percentage of people with other diets
77 changes: 77 additions & 0 deletions etl/steps/data/garden/survey/2024-04-01/dietary_choices_uk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""Load a meadow dataset and create a garden dataset."""

from owid.catalog import Table

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)

# Mapping from the row labels found in the survey's diet column (including the two "base" rows) to the short
# snake-case names used as column names once the table is pivoted to wide format in `run`.
COLUMNS = {
    "Base": "base",
    "Unweighted base": "base_unweighted",
    "Flexitarian (mainly vegetarian, but occasionally eat meat or fish)": "flexitarian",
    "Meat eater (eat meat and/or poultry)": "meat_eater",
    "None of these": "none",
    "Pescetarian (eat fish but do not eat meat or poultry)": "pescetarian",
    "Plant-based / Vegan (do not eat dairy products, eggs, or any other animal product)": "vegan",
    "Vegetarian (do not eat any meat, poultry, game, fish or shellfish)": "vegetarian",
}


def run_sanity_checks(tb: Table) -> None:
    """Run basic consistency checks on the processed survey table.

    The table is expected to contain the columns "base" and "base_unweighted" (numbers of responses) plus one
    numeric column per diet (percentages), with "date" available for grouping.

    Parameters
    ----------
    tb : Table
        Wide-format table, as produced by `run` right before saving.

    Raises
    ------
    AssertionError
        If any of the checks fails; the message states which check was violated.
    """
    # Every column other than the two bases is a diet percentage; on each row they should add up to ~100%.
    error = "Percentages do not add up to 100% for some of the surveyed dates (within 2%)."
    assert (abs(tb.drop(columns=["base", "base_unweighted"]).sum(axis=1) - 100) <= 2).all(), error

    error = "Negative values found in the table."
    assert (tb >= 0).all().all(), error

    # Compare, for each date, the total weighted and unweighted number of responses (summed over all groups).
    error = "Base and unweighted base, on a given date, should add up to the same number (or at least within 1%)."
    _tb = tb.groupby(["date"]).agg({"base": "sum", "base_unweighted": "sum"})
    # The error message was previously defined but never attached to this assertion.
    assert ((100 * abs(_tb["base"] - _tb["base_unweighted"]) / _tb["base_unweighted"]) < 1).all(), error


def run(dest_dir: str) -> None:
    """Load the meadow dataset, reshape it into one column per diet, and save it as a garden dataset.

    Parameters
    ----------
    dest_dir : str
        Destination folder, passed on to `create_dataset`.

    Raises
    ------
    AssertionError
        If the data contains diet labels that are not listed in `COLUMNS`, or if any sanity check fails.
    """
    #
    # Load inputs.
    #
    # Load meadow dataset and read its main table.
    ds_meadow = paths.load_dataset("dietary_choices_uk")
    tb = ds_meadow["dietary_choices_uk"].reset_index()

    #
    # Process data.
    #
    # Rename diet column name for convenience.
    tb = tb.rename(columns={"which_of_these_best_describes_your_diet": "diet"}, errors="raise")

    # Ensure all labels are known before mapping them; otherwise, `map` would silently turn them into nan.
    unknown_labels = set(tb["diet"]) - set(COLUMNS)
    error = f"Unknown labels found in the diet column (they may have been renamed): {unknown_labels}"
    assert not unknown_labels, error

    # Rename diets.
    tb["diet"] = tb["diet"].map(COLUMNS)

    # Transform the table to long format.
    tb = tb.melt(id_vars=["diet", "group"], var_name="date", value_name="value")

    # Format date column (drop the first character of each original column name, and use dashes as separators).
    tb["date"] = tb["date"].str[1:].str.replace("_", "-")

    # Transform the table to wide format.
    tb = tb.pivot(index=["group", "date"], columns="diet", values="value", join_column_levels_with="_")

    # Convert fractions into percentages.
    tb[tb.drop(columns=["group", "date", "base", "base_unweighted"]).columns] *= 100

    # Ensure columns have the right type.
    tb = tb.astype({"base": int, "base_unweighted": int})

    # Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
    tb = tb.format(keys=["group", "date"], sort_columns=True)

    # Sanity checks on outputs.
    run_sanity_checks(tb=tb)

    #
    # Save outputs.
    #
    # Create a new garden dataset.
    ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True)
    ds_garden.save()
55 changes: 55 additions & 0 deletions etl/steps/data/grapher/survey/2024-04-01/dietary_choices_uk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""Load a garden dataset and create a grapher dataset."""

import pandas as pd

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)

# Groups to select from the survey data.
# Rows of any other group are dropped in `run`, which also asserts that every group listed here is present.
SELECTED_GROUPS = ["All adults", "18-24", "25-49", "50-64", "65+"]


def run(dest_dir: str) -> None:
    """Load the garden dataset, adapt it to grapher's format, and save it as a grapher dataset.

    Survey groups take the place of "country", and survey dates are expressed in the "year" column as a number
    of days since the earliest date in the table (which is stored as "zeroDay" in the display metadata).

    Parameters
    ----------
    dest_dir : str
        Destination folder, passed on to `create_dataset`.

    Raises
    ------
    AssertionError
        If any of the groups in `SELECTED_GROUPS` is missing from the data.
    """
    #
    # Load inputs.
    #
    # Load garden dataset and read its main table.
    ds_garden = paths.load_dataset("dietary_choices_uk")
    tb = ds_garden["dietary_choices_uk"].reset_index()

    #
    # Process data.
    #
    # Adapt table format to grapher requirements.
    tb = tb.rename(columns={"group": "country", "date": "year"}, errors="raise").drop(
        columns=["base", "base_unweighted"], errors="raise"
    )

    # Select only the groups that are going to be displayed in grapher.
    tb = tb[tb["country"].isin(SELECTED_GROUPS)].reset_index(drop=True)

    # Sanity check.
    error = "A survey group may have been renamed."
    assert set(tb["country"]) == set(SELECTED_GROUPS), error

    # Prepare display metadata.
    # The earliest date is taken from the string representation of the dates (before they are converted below).
    date_earliest = tb["year"].astype(str).min()
    for column in tb.drop(columns=["country", "year"]).columns:
        tb[column].metadata.display["yearIsDay"] = True
        tb[column].metadata.display["zeroDay"] = date_earliest

    # Convert year column into a number of days since the earliest date in the table.
    # NOTE: The unit is given explicitly, since recent pandas versions reject the unit-less "datetime64" dtype.
    tb["year"] = tb["year"].astype("datetime64[ns]")
    tb["year"] = (tb["year"] - pd.to_datetime(date_earliest)).dt.days

    # Ensure the table is well formatted.
    tb = tb.format()

    #
    # Save outputs.
    #
    # Create a new grapher dataset.
    ds_grapher = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True)
    ds_grapher.save()
35 changes: 35 additions & 0 deletions etl/steps/data/meadow/survey/2024-04-01/dietary_choices_uk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Load a snapshot and create a meadow dataset."""

import owid.catalog.processing as pr

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Read every sheet of the snapshot's Excel file into one table and save it as a meadow dataset.

    Parameters
    ----------
    dest_dir : str
        Destination folder, passed on to `create_dataset`.
    """
    #
    # Load inputs.
    #
    # Fetch the snapshot and open it as an Excel file.
    snapshot = paths.load_snapshot("dietary_choices_uk.xlsx")
    excel_file = snapshot.ExcelFile()

    #
    # Process data.
    #
    # Parse each sheet, tagging its rows with the name of the sheet in a new "group" column.
    sheet_tables = []
    for name in excel_file.sheet_names:
        sheet_table = excel_file.parse(sheet_name=name)
        sheet_tables.append(sheet_table.assign(group=name))

    # Stack all sheets on top of each other.
    tb = pr.concat(sheet_tables)

    # Snake-case column names, index by diet label and group, and sort.
    index_columns = ["which_of_these_best_describes_your_diet", "group"]
    tb = tb.format(index_columns)

    #
    # Save outputs.
    #
    # Build the meadow dataset and write it to disk.
    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True)
    ds_meadow.save()
24 changes: 24 additions & 0 deletions snapshots/survey/2024-04-01/dietary_choices_uk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Script to create a snapshot of dataset."""

from pathlib import Path

import click

from etl.snapshot import Snapshot

# Version for current snapshot dataset.
SNAPSHOT_VERSION = Path(__file__).parent.name


@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
def main(upload: bool) -> None:
    # Location of the snapshot, whose version is given by the name of the folder containing this script.
    snapshot_uri = f"survey/{SNAPSHOT_VERSION}/dietary_choices_uk.xlsx"

    # Instantiate the snapshot and create it, uploading it only if requested.
    Snapshot(snapshot_uri).create_snapshot(upload=upload)


# Run the click command only when this file is executed as a script (not when it is imported).
if __name__ == "__main__":
    main()
27 changes: 27 additions & 0 deletions snapshots/survey/2024-04-01/dietary_choices_uk.xlsx.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
meta:
origin:
title: Dietary choices of Brits
description: |-
This dataset contains the result of YouGov surveys asking the question: "Which of these best describes your diet?". The available responses were:
- Meat eater: eat meat and/or poultry.
- Flexitarian: mainly vegetarian, but occasionally eat meat or fish.
- Pescetarian: eat fish but do not eat meat or poultry.
- Vegetarian: do not eat any meat, poultry, game, fish, or shellfish.
- Plant-based / Vegan: do not eat dairy products, eggs, or any other animal product.
- None of these.
date_published: "2024-01-03"
producer: YouGov
citation_full: |-
YouGov (2024) - Dietary choices of Brits (e.g. vegeterian, flexitarian, meat-eater etc)?
attribution_short: YouGov
url_main: https://yougov.co.uk/topics/society/trackers/dietery-choices-of-brits-eg-vegeterian-flexitarian-meat-eater-etc
url_download: https://yougov.co.uk/_pubapis/v5/uk/trackers/dietery-choices-of-brits-eg-vegeterian-flexitarian-meat-eater-etc/download/
date_accessed: 2024-04-01
license:
name: Copyright © 2024 YouGov PLC
url: https://yougov.co.uk/about/terms

outs:
- md5: 0c6c04daa40179fe76d305f61f4ae4f0
size: 40107
path: dietary_choices_uk.xlsx

0 comments on commit 0dca90d

Please sign in to comment.