diff --git a/dag/main.yml b/dag/main.yml index 44c2c297562..2c80ffed692 100644 --- a/dag/main.yml +++ b/dag/main.yml @@ -865,6 +865,17 @@ steps: data://grapher/wpf/2024-10-03/famines_by_place: - data://garden/wpf/2024-10-03/famines_by_place + data-private://meadow/owid/latest/population_explore: + - snapshot-private://owid/latest/population_explore.xlsx + data-private://garden/owid/latest/population_explore: + - data-private://meadow/owid/latest/population_explore + - data://garden/demography/2024-07-15/population + - data://garden/hyde/2017/baseline + - data://garden/gapminder/2019-12-10/population + - data://garden/un/2022-07-11/un_wpp + data-private://grapher/owid/latest/population_explore: + - data-private://garden/owid/latest/population_explore + data-private://meadow/owid/latest/ig_countries: - snapshot-private://owid/latest/ig_countries.csv data-private://garden/owid/latest/ig_countries: @@ -872,6 +883,7 @@ steps: data-private://grapher/owid/latest/ig_countries: - data-private://garden/owid/latest/ig_countries + include: - dag/open_numbers.yml - dag/faostat.yml diff --git a/etl/steps/data/garden/owid/latest/population_explore.countries.json b/etl/steps/data/garden/owid/latest/population_explore.countries.json new file mode 100644 index 00000000000..3acd3a047e2 --- /dev/null +++ b/etl/steps/data/garden/owid/latest/population_explore.countries.json @@ -0,0 +1,232 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "American Samoa": "American Samoa", + "Andorra": "Andorra", + "Angola": "Angola", + "Anguilla": "Anguilla", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bermuda": 
"Bermuda", + "Bhutan": "Bhutan", + "Bolivia (Plurinational State of)": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "British Virgin Islands": "British Virgin Islands", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape Verde": "Cape Verde", + "Cayman Islands": "Cayman Islands", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Channel Islands": "Channel Islands", + "Chile": "Chile", + "China": "China", + "China, Hong Kong SAR": "Hong Kong", + "China, Macao SAR": "Macao", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Ethiopia": "Ethiopia", + "Faeroe Islands": "Faroe Islands", + "Falkland Islands (Malvinas)": "Falkland Islands", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "French Guiana": "French Guiana", + "French Polynesia": "French Polynesia", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Gibraltar": "Gibraltar", + "Greece": "Greece", + "Greenland": "Greenland", + "Grenada": "Grenada", + "Guadeloupe": "Guadeloupe", + "Guam": "Guam", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", 
+ "Holy See": "Vatican", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Isle of Man": "Isle of Man", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Lao People's Democratic Republic": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Liechtenstein": "Liechtenstein", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Martinique": "Martinique", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mayotte": "Mayotte", + "Mexico": "Mexico", + "Micronesia (Fed. 
States of)": "Micronesia (country)", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Montserrat": "Montserrat", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "Netherlands Antilles": "Netherlands Antilles", + "New Caledonia": "New Caledonia", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "Northern Mariana Islands": "Northern Mariana Islands", + "Norway": "Norway", + "Occupied Palestinian Territory": "Palestine", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Puerto Rico": "Puerto Rico", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "R\u00e9union": "Reunion", + "Saint Helena": "Saint Helena", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Swaziland": "Eswatini", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + 
"Syrian Arab Republic": "Syria", + "TFYR Macedonia": "North Macedonia", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tokelau": "Tokelau", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Turkmenistan": "Turkmenistan", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "United States Virgin Islands": "United States Virgin Islands", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela (Bolivarian Republic of)": "Venezuela", + "Viet Nam": "Vietnam", + "Wallis and Futuna Islands": "Wallis and Futuna", + "Western Sahara": "Western Sahara", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Dem. 
People's Republic of Korea": "North Korea", + "Libyan Arab Jamahiriya": "Libya", + "Other non-specified areas": "Others" +} diff --git a/etl/steps/data/garden/owid/latest/population_explore.meta.yml b/etl/steps/data/garden/owid/latest/population_explore.meta.yml new file mode 100644 index 00000000000..31a30b27521 --- /dev/null +++ b/etl/steps/data/garden/owid/latest/population_explore.meta.yml @@ -0,0 +1,53 @@ +dataset: + update_period_days: 0 + title: Population (Maddison exploration) + non_redistributable: true + +tables: + population_explore: + variables: + population: + title: Population + description_short: Population of countries + unit: "people" + population_omm: + title: Population (omm) + description_short: Population of countries + unit: "people" + population_hyde: + title: Population (HYDE) + description_short: Population of countries + unit: "people" + population_gm: + title: Population (gapminder) + description_short: Population of countries + unit: "people" + origins: + - title: Gapminder + url: https://www.gapminder.org/data/ + producer: Gapminder + population_wpp: + title: Population (UN) + description_short: Population of countries + unit: "people" + diff: + title: Population difference + description_short: Population difference + unit: people + + population_hyde_cut: + title: Population (HYDE), cut + description_short: Population of countries + unit: "people" + population_gm_cut: + title: Population (gapminder), cut + description_short: Population of countries + unit: "people" + origins: + - title: Gapminder + url: https://www.gapminder.org/data/ + producer: Gapminder + population_wpp_cut: + title: Population (UN), cut + description_short: Population of countries + unit: "people" diff --git a/etl/steps/data/garden/owid/latest/population_explore.py b/etl/steps/data/garden/owid/latest/population_explore.py new file mode 100644 index 00000000000..64128301579 --- /dev/null +++ b/etl/steps/data/garden/owid/latest/population_explore.py @@ -0,0 +1,198 @@ 
+"""Load a meadow dataset and create a garden dataset."""
+
+import owid.catalog.processing as pr
+
+from etl.data_helpers import geo
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+YEAR_MIN = 1790
+YEAR_MAX = 1955
+
+# Countries that are relevant for the analysis
+COUNTRIES_RELEVANT = [
+    "Afghanistan",
+    "Albania",
+    "Algeria",
+    "Andorra",
+    "Angola",
+    "Armenia",
+    "Azerbaijan",
+    "Bahrain",
+    "Bangladesh",
+    "Belarus",
+    "British Virgin Islands",
+    "Burundi",
+    "Cambodia",
+    "Cameroon",
+    "Central African Republic",
+    "Chad",
+    "Comoros",
+    "South Korea",
+    "North Korea",
+    "Democratic Republic of Congo",
+    "Djibouti",
+    "Eritrea",
+    "Falkland Islands",
+    "Gabon",
+    "Georgia",
+    "Germany",
+    "Guam",
+    "Iceland",
+    "India",
+    "Iran",
+    "Iraq",
+    "Iran",  # NOTE(review): duplicate of the "Iran" entry above
+    "Ireland",
+    "Japan",
+    "Kazakhstan",
+    "Kenya",
+    "Kuwait",
+    "Kyrgyzstan",
+    "Laos",
+    "Latvia",
+    "Libya",
+    "Liechtenstein",
+    "Madagascar",
+    "Malawi",
+    "Mali",
+    "Marshall Islands",
+    "Mauritania",
+    "Micronesia (country)",  # TODO
+    "Namibia",
+    "Nauru",
+    "New Caledonia",
+    "New Zealand",
+    "Niger",
+    "Nigeria",
+    "Niue",
+    # "Northern Mariana Islands",  # TODO
+    # "Palestine",  # TODO
+    "Oman",
+    "Palau",
+    "Paraguay",
+    "Qatar",
+    # "Russia",  # TODO
+    "Rwanda",
+    "Reunion",
+    "Saint Helena",
+    "Saint Kitts and Nevis",
+    "Saint Lucia",
+    "Saint Pierre and Miquelon",
+    "Saint Vincent and the Grenadines",
+    "Samoa",
+    "San Marino",
+    "Sao Tome and Principe",
+    "Saudi Arabia",
+    "Senegal",
+    "Serbia",
+    "Sierra Leone",
+    "Slovakia",
+    "Solomon Islands",
+    "Somalia",
+    "South Africa",
+    "Sudan",
+    "Tajikistan",
+    "Thailand",
+    "Tonga",
+    "Turkmenistan",
+    "Uganda",
+    "Ukraine",
+    "United Arab Emirates",
+    "Uzbekistan",
+    "Vanuatu",
+    "Yemen",
+    "Zambia",
+]
+
+
+def standardize_tb(tb, tb_main, col_population: str = "population"):
+    """Reduce `tb` to country/year/population rows that are comparable with `tb_main`.
+
+    Keeps only countries present in `tb_main` and years within [YEAR_MIN, YEAR_MAX], rounds the
+    population to nullable integers, and renames the population column to `col_population`.
+    """
+    tb = tb.loc[:, ["country", "year", "population"]]
+    tb = tb.loc[tb["country"].isin(tb_main["country"].unique())]
+    tb = tb.loc[(tb["year"] >= YEAR_MIN) & (tb["year"] <= YEAR_MAX)]
+    tb["population"] = tb["population"].round().astype("Int64")
+
+    return tb.rename(columns={"population": col_population})
+
+
+def run(dest_dir: str) -> None:
+    """Harmonize the meadow table, merge in OMM/HYDE/Gapminder/WPP population and save the garden dataset."""
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset and the garden datasets it is compared against.
+    ds_meadow = paths.load_dataset("population_explore")
+    ds_omm = paths.load_dataset("population", namespace="demography")
+    ds_hyde = paths.load_dataset("baseline")
+    ds_gm = paths.load_dataset("population", namespace="gapminder")
+    ds_wpp = paths.load_dataset("un_wpp")
+
+    # Read the relevant table from each dataset.
+    tb = ds_meadow["population_explore"].reset_index()
+    tb_omm = ds_omm["population"].reset_index()
+    tb_hyde = ds_hyde["population"].reset_index()
+    tb_gm = ds_gm["population"].reset_index()
+    tb_wpp = ds_wpp["population"].reset_index()
+
+    #
+    # Process data.
+    #
+    tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path)
+
+    # Format OMM column
+    tb_omm = standardize_tb(tb_omm, tb, "population_omm")
+    # Format HYDE
+    tb_hyde = standardize_tb(tb_hyde, tb, "population_hyde")
+    # Format Gapminder
+    tb_gm = standardize_tb(tb_gm, tb, "population_gm")
+    # Format WPP
+    tb_wpp = tb_wpp[
+        (tb_wpp["age"] == "all")
+        & (tb_wpp["sex"] == "all")
+        & (tb_wpp["metric"] == "population")
+        & (tb_wpp["variant"] == "estimates")
+    ].rename(columns={"location": "country", "value": "population"})
+    tb_wpp = standardize_tb(tb_wpp, tb, "population_wpp")
+
+    # Merge
+    tb = pr.multi_merge(
+        tables=[tb, tb_omm, tb_hyde, tb_gm, tb_wpp],
+        on=["country", "year"],
+        how="outer",
+    )
+    # Difference between the meadow series and the OMM series.
+    tb["diff"] = tb["population"] - tb["population_omm"]
+
+    # Add cut versions: each source is kept only within its own year range (missing elsewhere).
+    # NOTE(review): 1950 falls in both the Gapminder and the WPP cut — confirm the overlap is intended.
+    tb["population_hyde_cut"] = tb.loc[tb["year"] <= 1800, "population_hyde"]
+    tb["population_gm_cut"] = tb.loc[(tb["year"] >= 1801) & (tb["year"] <= 1950), "population_gm"]
+    tb["population_wpp_cut"] = tb.loc[tb["year"] >= 1950, "population_wpp"]
+
+    # Filter relevant countries (currently disabled, so COUNTRIES_RELEVANT is unused)
+    # tb = tb.loc[tb["country"].isin(COUNTRIES_RELEVANT)]
+
+    # Format
+    tb = tb.format(["country", "year"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir,
+        tables=[tb],
+        check_variables_metadata=True,
+        default_metadata=ds_meadow.metadata,
+        formats=["csv", "feather"],
+    )
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
diff --git a/etl/steps/data/grapher/owid/latest/population_explore.py b/etl/steps/data/grapher/owid/latest/population_explore.py
new file mode 100644
index 00000000000..c2a096bb625
--- /dev/null
+++ b/etl/steps/data/grapher/owid/latest/population_explore.py
@@ -0,0 +1,14 @@
+from etl.helpers import PathFinder, create_dataset
+
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    """Create the grapher dataset from the garden table, reusing the garden dataset metadata."""
+    ds_garden = paths.load_dataset("population_explore")
+    tb = ds_garden["population_explore"]
+
+    ds_grapher = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata
+    )
+    ds_grapher.save()
diff --git a/etl/steps/data/meadow/owid/latest/population_explore.py b/etl/steps/data/meadow/owid/latest/population_explore.py
new file mode 100644
index 00000000000..1ee8aa3f181
--- /dev/null
+++ b/etl/steps/data/meadow/owid/latest/population_explore.py
@@ -0,0 +1,43 @@
+"""Load a snapshot and create a meadow dataset."""
+
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    """Read the population_explore snapshot, reshape it to long format and save it as a meadow dataset."""
+    #
+    # Load inputs.
+    #
+    # Load snapshot table.
+    tb = paths.read_snap_table("population_explore.xlsx")
+    # Drop the first five rows (presumably non-data rows in the sheet — TODO confirm).
+    tb = tb.drop(index=range(0, 5))
+
+    # Reshape from wide (one column per country) to long; the "source" column holds the year.
+    tb = tb.melt(
+        id_vars=["source"],
+        var_name="country",
+        value_name="population",
+    ).rename(columns={"source": "year"})
+
+    # Scale by 1,000 (presumably the sheet reports thousands of people — TODO confirm) and
+    # store as nullable integers.
+    tb["population"] = (tb["population"] * 1000).astype(float).round().astype("Int64")
+
+    #
+    # Process data.
+    #
+    tb = tb.format(["country", "year"], short_name="population_explore")
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True)
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
diff --git a/snapshots/owid/latest/population_explore.py b/snapshots/owid/latest/population_explore.py
new file mode 100644
index 00000000000..6af7d3655d3
--- /dev/null
+++ b/snapshots/owid/latest/population_explore.py
@@ -0,0 +1,35 @@
+from pathlib import Path
+
+import click
+from structlog import get_logger
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+# Log
+log = get_logger()
+
+
+########################################################################################################################
+# NOTE: This snapshot is created from a local file; the accompanying .dvc metadata records no
+# download URL, so pass the Excel file with --path-to-file.
+# NOTE(review): the previous TODO here referred to the UN WPP 2024 revision
+# (https://population.un.org/wpp) and looks copied from another snapshot script — confirm
+# whether it still applies.
+########################################################################################################################
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+@click.option("--path-to-file", type=str, help="Path to population local file.")
+def main(upload: bool, path_to_file: str | None = None) -> None:
+    """Create the population_explore snapshot from a local file and optionally upload it."""
+    # Create a new snapshot.
+    snap = Snapshot(f"owid/{SNAPSHOT_VERSION}/population_explore.xlsx")
+
+    # Copy local data file to snapshots data folder, add file to DVC and upload to S3.
+    snap.create_snapshot(filename=path_to_file, upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/owid/latest/population_explore.xlsx.dvc b/snapshots/owid/latest/population_explore.xlsx.dvc
new file mode 100644
index 00000000000..803aa8c62b7
--- /dev/null
+++ b/snapshots/owid/latest/population_explore.xlsx.dvc
@@ -0,0 +1,29 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  is_public: false
+
+  origin:
+    # Data product / Snapshot
+    title: pop
+    description: |-
+      pop
+    date_published: "2024"
+
+    # Citation
+    producer: "?"
+    citation_full: "Something here"
+
+    # Files
+    url_main: ""
+    date_accessed: 2024-11-06
+
+    # License
+    license:
+      name: Open Government Licence v3.0
+      url: https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/
+
+outs:
+  - md5: 50f6fdb368ab8068dbb19e5c029455a2
+    size: 435873
+    path: population_explore.xlsx