Add YouGov data on dietary choices of Brits (#2480)

* Add snapshot, meadow, garden and grapher steps for YouGov data on dietary choices of Brits * Improve metadata and adapt dates for grapher * Set number of decimals
owid · Apr 1, 2024 · 0dca90d · 0dca90d
1 parent c7a88ee
commit 0dca90d
Show file tree

Hide file tree

Showing 8 changed files with 327 additions and 0 deletions.
diff --git a/dag/main.yml b/dag/main.yml
@@ -751,3 +751,4 @@ include:
   - dag/poverty_inequality.yml
   - dag/democracy.yml
   - dag/temp.yml
+  - dag/survey.yml
diff --git a/dag/survey.yml b/dag/survey.yml
@@ -0,0 +1,14 @@
+steps:
+  #
+  # YouGov - Dietary choices of Brits.
+  #
+  data://meadow/survey/2024-04-01/dietary_choices_uk:
+    - snapshot://survey/2024-04-01/dietary_choices_uk.xlsx
+  data://garden/survey/2024-04-01/dietary_choices_uk:
+    - data://meadow/survey/2024-04-01/dietary_choices_uk
+  data://grapher/survey/2024-04-01/dietary_choices_uk:
+    - data://garden/survey/2024-04-01/dietary_choices_uk
+
+  ######################################################################################################################
+  # Older versions to be archived once they are not used by any other steps.
+  ######################################################################################################################
diff --git a/etl/steps/data/garden/survey/2024-04-01/dietary_choices_uk.meta.yml b/etl/steps/data/garden/survey/2024-04-01/dietary_choices_uk.meta.yml
@@ -0,0 +1,94 @@
+definitions:
+  common:
+    processing_level: minor
+    presentation:
+      attribution_short: YouGov
+      topic_tags:
+        - Diet Compositions
+        - Animal Welfare
+        - Food Supply
+    display:  &common-display
+      numDecimalPlaces: 0
+
+dataset:
+  update_period_days: 365
+
+tables:
+  dietary_choices_uk:
+    variables:
+      base:
+        title: Weighted number of responses
+        description_short: |-
+          Number of responses, after applying weights, of a particular group (e.g. "18-24" or "Female") and date.
+        unit: ""
+        short_unit: ""
+      base_unweighted:
+        title: Unweighted number of responses
+        description_short: |-
+          Number of responses, before applying weights, of a particular group (e.g. "18-24" or "Female") and date.
+        unit: ""
+        short_unit: ""
+      meat_eater:
+        title: Percentage of meat eaters
+        description_short: |-
+          Percentage of participants who responded to the question "Which of these best describes your diet?" with "Meat eater: eat meat and/or poultry".
+        unit: "%"
+        short_unit: "%"
+        display:
+          name: Meat eater
+          <<: *common-display
+        presentation:
+          title_public: Percentage of meat eaters
+      flexitarian:
+        title: Percentage of flexitarians
+        description_short: |-
+          Percentage of participants who responded to the question "Which of these best describes your diet?" with "Flexitarian: mainly vegetarian, but occasionally eat meat or fish".
+        unit: "%"
+        short_unit: "%"
+        display:
+          name: Flexitarian
+          <<: *common-display
+        presentation:
+          title_public: Percentage of flexitarians
+      pescetarian:
+        title: Percentage of pescetarians
+        description_short: |-
+          Percentage of participants who responded to the question "Which of these best describes your diet?" with "Pescetarian: eat fish but do not eat meat or poultry".
+        unit: "%"
+        short_unit: "%"
+        display:
+          name: Pescetarian
+          <<: *common-display
+        presentation:
+          title_public: Percentage of pescetarians
+      vegetarian:
+        title: Percentage of vegetarians
+        description_short: |-
+          Percentage of participants who responded to the question "Which of these best describes your diet?" with "Vegetarian: do not eat any meat, poultry, game, fish, or shellfish".
+        unit: "%"
+        short_unit: "%"
+        display:
+          name: Vegetarian
+          <<: *common-display
+        presentation:
+          title_public: Percentage of vegetarians
+      vegan:
+        title: Percentage of vegans
+        description_short: |-
+          Percentage of participants who responded to the question "Which of these best describes your diet?" with "Plant-based / Vegan: do not eat dairy products, eggs, or any other animal product".
+        unit: "%"
+        short_unit: "%"
+        display:
+          name: Vegan
+          <<: *common-display
+        presentation:
+          title_public: Percentage of vegans
+      none:
+        title: Percentage of people with other diets
+        description_short: |-
+          Percentage of participants who responded to the question "Which of these best describes your diet?" with "None of these".
+        unit: "%"
+        short_unit: "%"
+        display:
+          name: None of these
+          <<: *common-display
+        presentation:
+          title_public: Percentage of people with other diets
diff --git a/etl/steps/data/garden/survey/2024-04-01/dietary_choices_uk.py b/etl/steps/data/garden/survey/2024-04-01/dietary_choices_uk.py
@@ -0,0 +1,77 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+from owid.catalog import Table
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+# Mapping from the row labels found in the diet column of the meadow table to the short names used in this step.
+# After the table is pivoted by diet, these short names become the column names of the garden table.
+COLUMNS = {
+    "Base": "base",
+    "Unweighted base": "base_unweighted",
+    "Flexitarian (mainly vegetarian, but occasionally eat meat or fish)": "flexitarian",
+    "Meat eater (eat meat and/or poultry)": "meat_eater",
+    "None of these": "none",
+    "Pescetarian (eat fish but do not eat meat or poultry)": "pescetarian",
+    "Plant-based / Vegan (do not eat dairy products, eggs, or any other animal product)": "vegan",
+    "Vegetarian (do not eat any meat, poultry, game, fish or shellfish)": "vegetarian",
+}
+
+
+def run_sanity_checks(tb: Table) -> None:
+    error = "Percentages do not add up to 100% for some of the surveyed dates (within 2%)."
+    assert (abs(tb.drop(columns=["base", "base_unweighted"]).sum(axis=1) - 100) <= 2).all(), error
+
+    error = "Negative values found in the table."
+    assert (tb >= 0).all().all(), error
+
+    error = "Base and unweighted base, on a given date, should add up to the same number (or at least within 1%)."
+    _tb = tb.groupby(["date"]).agg({"base": "sum", "base_unweighted": "sum"})
+    assert ((100 * abs(_tb["base"] - _tb["base_unweighted"]) / _tb["base_unweighted"]) < 1).all()
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset and read its main table.
+    ds_meadow = paths.load_dataset("dietary_choices_uk")
+    tb = ds_meadow["dietary_choices_uk"].reset_index()
+
+    #
+    # Process data.
+    #
+    # Rename diet column name for convenience.
+    tb = tb.rename(columns={"which_of_these_best_describes_your_diet": "diet"}, errors="raise")
+
+    # Rename diets.
+    tb["diet"] = tb["diet"].map(COLUMNS)
+
+    # Transform the table to long format.
+    tb = tb.melt(id_vars=["diet", "group"], var_name="date", value_name="value")
+
+    # Format date column.
+    tb["date"] = tb["date"].str[1:].str.replace("_", "-")
+
+    # Transform the table to wide format.
+    tb = tb.pivot(index=["group", "date"], columns="diet", values="value", join_column_levels_with="_")
+
+    # Convert fractions into percentages.
+    tb[tb.drop(columns=["group", "date", "base", "base_unweighted"]).columns] *= 100
+
+    # Ensure columns have the right type.
+    tb = tb.astype({"base": int, "base_unweighted": int})
+
+    # Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
+    tb = tb.format(keys=["group", "date"], sort_columns=True)
+
+    # Sanity checks on outputs.
+    run_sanity_checks(tb=tb)
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset.
+    ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True)
+    ds_garden.save()
diff --git a/etl/steps/data/grapher/survey/2024-04-01/dietary_choices_uk.py b/etl/steps/data/grapher/survey/2024-04-01/dietary_choices_uk.py
@@ -0,0 +1,55 @@
+"""Load a garden dataset and create a grapher dataset."""
+
+import pandas as pd
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+# Groups to select from the survey data.
+SELECTED_GROUPS = ["All adults", "18-24", "25-49", "50-64", "65+"]
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load garden dataset and read its main table.
+    ds_garden = paths.load_dataset("dietary_choices_uk")
+    tb = ds_garden["dietary_choices_uk"].reset_index()
+
+    #
+    # Process data.
+    #
+    # Adapt table format to grapher requirements.
+    tb = tb.rename(columns={"group": "country", "date": "year"}, errors="raise").drop(
+        columns=["base", "base_unweighted"], errors="raise"
+    )
+
+    # Select only the groups that are going to be displayed in grapher.
+    tb = tb[tb["country"].isin(SELECTED_GROUPS)].reset_index(drop=True)
+
+    # Sanity check.
+    error = "A survey group may have been renamed."
+    assert set(tb["country"]) == set(SELECTED_GROUPS), error
+
+    # Prepare display metadata.
+    date_earliest = tb["year"].astype(str).min()
+    for column in tb.drop(columns=["country", "year"]).columns:
+        tb[column].metadata.display["yearIsDay"] = True
+        tb[column].metadata.display["zeroDay"] = date_earliest
+
+    # Convert year column into a number of days since the earliest date in the table.
+    tb["year"] = tb["year"].astype("datetime64")
+    tb["year"] = (tb["year"] - pd.to_datetime(date_earliest)).dt.days
+
+    # Ensure the table is well formatted.
+    tb = tb.format()
+
+    #
+    # Save outputs.
+    #
+    # Create a new grapher dataset.
+    ds_grapher = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True)
+    ds_grapher.save()
diff --git a/etl/steps/data/meadow/survey/2024-04-01/dietary_choices_uk.py b/etl/steps/data/meadow/survey/2024-04-01/dietary_choices_uk.py
@@ -0,0 +1,35 @@
+"""Load a snapshot and create a meadow dataset."""
+
+import owid.catalog.processing as pr
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Retrieve snapshot and read its main table.
+    snap = paths.load_snapshot("dietary_choices_uk.xlsx")
+    data = snap.ExcelFile()
+
+    #
+    # Process data.
+    #
+    # Combine all sheets into a single table.
+    tb = pr.concat(
+        [data.parse(sheet_name=sheet_name).assign(**{"group": sheet_name}) for sheet_name in data.sheet_names]
+    )
+
+    # Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
+    tb = tb.format(["which_of_these_best_describes_your_diet", "group"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True)
+    ds_meadow.save()
diff --git a/snapshots/survey/2024-04-01/dietary_choices_uk.py b/snapshots/survey/2024-04-01/dietary_choices_uk.py
@@ -0,0 +1,24 @@
+"""Script to create a snapshot of dataset."""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+def main(upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"survey/{SNAPSHOT_VERSION}/dietary_choices_uk.xlsx")
+
+    # Download data from source, add file to DVC and upload to S3.
+    snap.create_snapshot(upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/survey/2024-04-01/dietary_choices_uk.xlsx.dvc b/snapshots/survey/2024-04-01/dietary_choices_uk.xlsx.dvc
@@ -0,0 +1,27 @@
+meta:
+  origin:
+    title: Dietary choices of Brits
+    description: |-
+      This dataset contains the result of YouGov surveys asking the question: "Which of these best describes your diet?". The available responses were:
+      - Meat eater: eat meat and/or poultry.
+      - Flexitarian: mainly vegetarian, but occasionally eat meat or fish.
+      - Pescetarian: eat fish but do not eat meat or poultry.
+      - Vegetarian: do not eat any meat, poultry, game, fish, or shellfish.
+      - Plant-based / Vegan: do not eat dairy products, eggs, or any other animal product.
+      - None of these.
+    date_published: "2024-01-03"
+    producer: YouGov
+    citation_full: |-
+      YouGov (2024) - Dietary choices of Brits (e.g. vegeterian, flexitarian, meat-eater etc)?
+    attribution_short: YouGov
+    url_main: https://yougov.co.uk/topics/society/trackers/dietery-choices-of-brits-eg-vegeterian-flexitarian-meat-eater-etc
+    url_download: https://yougov.co.uk/_pubapis/v5/uk/trackers/dietery-choices-of-brits-eg-vegeterian-flexitarian-meat-eater-etc/download/
+    date_accessed: 2024-04-01
+    license:
+      name: Copyright © 2024 YouGov PLC
+      url: https://yougov.co.uk/about/terms
+
+outs:
+  - md5: 0c6c04daa40179fe76d305f61f4ae4f0
+    size: 40107
+    path: dietary_choices_uk.xlsx