diff --git a/dag/health.yml b/dag/health.yml
index c49a569f498..52f745f03cd 100644
--- a/dag/health.yml
+++ b/dag/health.yml
@@ -715,11 +715,12 @@ steps:
   data://grapher/who/2024-08-06/mortality_database_cancer_most_common:
     - data://garden/who/2024-08-06/mortality_database_cancer_most_common
-
+  # Mpox - WHO
   data://meadow/who/latest/monkeypox:
     - snapshot://who/latest/monkeypox.csv
   data://garden/who/latest/monkeypox:
     - data://meadow/who/latest/monkeypox
+    - data://meadow/who/latest/monkeypox_shiny
     - data://garden/demography/2023-03-31/population
     - data://garden/regions/2023-01-01/regions
     - data://garden/health/latest/global_health_mpox
@@ -729,6 +730,9 @@ steps:
     - data://garden/who/latest/monkeypox
   export://github/who/latest/monkeypox:
     - data://garden/who/latest/monkeypox
+  # Mpox Shiny app
+  data://meadow/who/latest/monkeypox_shiny:
+    - snapshot://who/latest/monkeypox_shiny.csv
   # Mpox - Global.health
   data://meadow/health/latest/global_health_mpox:
     - snapshot://health/latest/global_health_mpox.csv
diff --git a/etl/steps/data/garden/health/latest/global_health_mpox.countries.json b/etl/steps/data/garden/health/latest/global_health_mpox.countries.json
index 1c00c5f53c0..65c20f51829 100644
--- a/etl/steps/data/garden/health/latest/global_health_mpox.countries.json
+++ b/etl/steps/data/garden/health/latest/global_health_mpox.countries.json
@@ -16,5 +16,6 @@
   "Uganda": "Uganda",
   "Burundi ": "Burundi",
   "Republic of the Congo": "Congo",
-  "Guinea": "Guinea"
+  "Morocco": "Morocco",
+  "Guinea ": "Guinea"
 }
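A note on the mapping file above: `geo.harmonize_countries` keys on the raw spellings exactly as they appear in the source, trailing whitespace included, which is why the `"Burundi "` and `"Guinea "` entries carry a trailing space. A minimal sketch of the lookup behaviour on toy names (plain pandas; the real helper does more, e.g. logging unmapped names):

```python
import pandas as pd

# Raw spellings as they appear in the source export, trailing space included.
raw = pd.Series(["Guinea ", "Republic of the Congo", "Morocco", "Unknownland"])

mapping = {
    "Guinea ": "Guinea",
    "Republic of the Congo": "Congo",
    "Morocco": "Morocco",
}

# Unmapped names become NaN, mirroring make_missing_countries_nan=True.
print(raw.map(mapping).tolist())  # ['Guinea', 'Congo', 'Morocco', nan]
```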
diff --git a/etl/steps/data/garden/who/latest/monkeypox/__init__.py b/etl/steps/data/garden/who/latest/monkeypox/__init__.py
index d94630b5fdc..059405c5c3b 100644
--- a/etl/steps/data/garden/who/latest/monkeypox/__init__.py
+++ b/etl/steps/data/garden/who/latest/monkeypox/__init__.py
@@ -1,6 +1,7 @@
 """Load a meadow dataset and create a garden dataset."""

 import datetime
+from typing import List

 import owid.catalog.processing as pr
 import pandas as pd
@@ -31,20 +32,25 @@ def run(dest_dir: str) -> None:
     #
     # Load meadow dataset.
     ds_meadow = paths.load_dataset("monkeypox")
-    ds_suspected = paths.load_dataset("global_health_mpox")
+    ds_meadow_shiny = paths.load_dataset("monkeypox_shiny")
+    # ds_suspected = paths.load_dataset("global_health_mpox")

     # Read table from meadow dataset.
     tb = ds_meadow["monkeypox"].reset_index()
-    tb_suspected = ds_suspected["global_health_mpox"].reset_index()
-    cols = ["country", "date", "suspected_cases_cumulative"]
-    tb_suspected = tb_suspected[cols]
-    assert tb_suspected.shape[1] == len(cols)
+    tb["source"] = "xmart"  # label rows from the WHO xMart extract
+    tb_africa = ds_meadow_shiny["monkeypox_shiny"].reset_index()
+    rename_dict = {"week_end_date": "date"}
+    tb_africa = tb_africa.rename(columns=rename_dict, errors="raise")
+    tb_africa = format_africa(tb_africa)
+    tb_africa["source"] = "shiny"  # label rows so remove_duplicates() can prefer them
+
     # Grab origins metadata
     origins = tb["total_conf_cases"].metadata.origins
     #
     # Process data.
     #
     tb_orig = tb.copy()
-    tb = geo.harmonize_countries(
-        df=tb,
+    tb_combine = pr.concat([tb, tb_africa], ignore_index=True)
+    tb_combine = geo.harmonize_countries(
+        df=tb_combine,
         countries_file=paths.country_mapping_path,
         make_missing_countries_nan=True,
     )
@@ -55,6 +61,8 @@ def run(dest_dir: str) -> None:
         log.warning(f"Missing countries in monkeypox.countries.json: {missing_countries}")
         tb.country = tb.country.astype(str).fillna(tb_orig.country)

+    # Remove overlapping country-date rows, preferring the Shiny data over xMart
+    tb = remove_duplicates(tb=tb_combine, preferred_source="shiny", dimensions=["country", "date"])
     tb = (
         tb.pipe(clean_columns)
         .pipe(clean_date)
@@ -67,37 +75,121 @@ def run(dest_dir: str) -> None:
         .pipe(filter_dates)
     )

-    tb_both = pr.merge(tb, tb_suspected, on=["country", "date"], how="outer")
-
     # For variables on deaths we should show that data reported by the WHO show _only_ confirmed cases, in an annotation
-    country_mask = tb_both["country"] == "Democratic Republic of Congo"
-    tb_both["annotation"] = ""
-    tb_both.loc[country_mask, "annotation"] = (
-        tb_both.loc[country_mask, "annotation"] + "Includes only confirmed deaths as reported by WHO"
+    country_mask = tb["country"] == "Democratic Republic of Congo"
+    tb["annotation"] = ""
+    tb.loc[country_mask, "annotation"] = (
+        tb.loc[country_mask, "annotation"] + "Includes only confirmed deaths as reported by WHO"
     )
-    tb_both["annotation"].metadata.origins = origins
-    tb_both = tb_both.format(["country", "date"])
+    tb["annotation"].metadata.origins = origins
+    tb = tb.format(["country", "date"])

     #
     # Save outputs.
     #
     # Create a new garden dataset with the same metadata as the meadow dataset.
     ds_garden = create_dataset(
-        dest_dir, tables=[tb_both], check_variables_metadata=True, default_metadata=ds_meadow.metadata
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
     )

     # Save changes in the new garden dataset.
     ds_garden.save()

+
+def fix_suspected_cases(tb: Table) -> Table:
+    # In the WHO data, the suspected counts include confirmed cases.
+    # Subtract confirmed from suspected to recover the truly suspected counts,
+    # then rebuild the cumulative series per country from the weekly values.
+    # tb["total_suspected_cases"] = tb["total_suspected_cases"].astype(int) - tb["total_confirmed_cases"].astype(int)
+    # tb["total_suspected_deaths"] = tb["total_suspected_deaths"].astype(int) - tb["total_deaths"].astype(int)
+    tb["new_suspected_cases"] = tb["new_suspected_cases"].astype(int) - tb["new_confirmed_cases"].astype(int)
+    tb["new_suspected_deaths"] = tb["new_suspected_deaths"].astype(int) - tb["new_deaths"].astype(int)
+    tb["total_suspected_cases"] = tb.groupby("country").new_suspected_cases.cumsum()
+    tb["total_suspected_deaths"] = tb.groupby("country").new_suspected_deaths.cumsum()
+
+    assert tb["total_suspected_cases"].min() >= 0
+    assert tb["total_suspected_deaths"].min() >= 0
+    assert tb["new_suspected_cases"].min() >= 0, "Confirmed cases exceed reported suspected cases"
+    assert tb["new_suspected_deaths"].min() >= 0, "Confirmed deaths exceed reported suspected deaths"
+
+    return tb
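To make the arithmetic in `fix_suspected_cases` concrete, here is a standalone sketch on toy numbers (plain pandas rather than an owid `Table`; the values are invented):

```python
import pandas as pd

# Toy weekly reports for one country, WHO-style: the "suspected" column
# still includes confirmed cases.
tb = pd.DataFrame(
    {
        "country": ["Congo"] * 3,
        "new_suspected_cases": [10, 8, 12],  # as reported (confirmed + suspected)
        "new_confirmed_cases": [4, 3, 5],
    }
)

# Subtract confirmed from the reported suspected counts ...
tb["new_suspected_cases"] = tb["new_suspected_cases"] - tb["new_confirmed_cases"]
# ... and rebuild the cumulative series per country from the weekly values.
tb["total_suspected_cases"] = tb.groupby("country")["new_suspected_cases"].cumsum()
print(tb["total_suspected_cases"].tolist())  # [6, 11, 18]
```

The per-country cumulative sum matters once several countries are concatenated: it keeps one country's weekly counts from leaking into another's running total.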
+
+
+def format_africa(tb: Table) -> Table:
+    # Add 2023 baseline totals of suspected cases and strip confirmed cases out of the suspected counts.
+    tb["date"] = pd.to_datetime(tb["date"], errors="coerce")  # unparseable dates become NaT
+    tb = tb[tb["date"] > "2023-12-24"]
+    tb = fix_suspected_cases(tb)
+    tb = tb[
+        [
+            "country",
+            "date",
+            "total_suspected_cases",
+            "total_suspected_deaths",
+            "new_suspected_cases",
+            "new_suspected_deaths",
+        ]
+    ].copy(deep=True)
+    # Sanity check: the filter above should have dropped all suspected-case totals up to 2023-12-24
+    assert tb[tb["date"] <= "2023-12-24"]["total_suspected_cases"].sum() == 0
+
+    # Seed cumulative totals for 2023 (in the raw data, suspected counts are the sum of confirmed and suspected cases)
+    tb_2023 = Table(
+        {
+            "country": ["Cameroon", "Congo", "Democratic Republic of Congo"],
+            "date": ["2023-12-24", "2023-12-24", "2023-12-24"],
+            "new_suspected_cases": ["113", "74", "12985"],
+        }
+    )
+    tb_2023["date"] = pd.to_datetime(tb_2023["date"])  # match dtypes before concatenating
+
+    tb = pr.concat([tb, tb_2023], ignore_index=True).sort_values(["country", "date"])
+    tb["new_suspected_cases"] = pr.to_numeric(tb["new_suspected_cases"], errors="coerce")
+    tb["total_suspected_cases"] = tb["new_suspected_cases"].groupby(tb["country"]).cumsum()
+    # Check the cumulative sum picked up the 2023 seed values
+    assert tb.loc[(tb["country"] == "Congo") & (tb["date"] == "2023-12-24"), "total_suspected_cases"].min() >= 74
+
+    return tb
+
+
+def remove_duplicates(tb: Table, preferred_source: str, dimensions: List[str]) -> Table:
+    """
+    Remove rows that overlap on `dimensions`, keeping the row from `preferred_source`.
+    """
+    assert any(tb["source"] == preferred_source), f"No rows from source '{preferred_source}' in table"
+    tb = tb.copy(deep=True)
+
+    duplicate_rows = tb.duplicated(subset=dimensions, keep=False)
+    tb_no_duplicates = tb[~duplicate_rows]
+    tb_duplicates = tb[duplicate_rows]
+    tb_duplicates_preferred = tb_duplicates[tb_duplicates["source"] == preferred_source]
+
+    tb = pr.concat([tb_no_duplicates, tb_duplicates_preferred], ignore_index=True)
+
+    assert len(tb[tb.duplicated(subset=dimensions, keep=False)]) == 0, "Duplicates still in table!"
+
+    return tb
+
+
 def clean_columns(tb: Table) -> Table:
-    return tb.loc[:, ["country", "iso3", "date", "total_conf_cases", "total_conf_deaths"]].rename(
+    return tb.loc[
+        :,
+        [
+            "country",
+            "iso3",
+            "date",
+            "total_conf_cases",
+            "total_conf_deaths",
+            "total_suspected_cases",
+            "total_suspected_deaths",
+        ],
+    ].rename(
         columns={
-            "date": "date",
             "total_conf_cases": "total_cases",
             "total_conf_deaths": "total_deaths",
             "iso3": "iso_code",
-        }
+        },
+        errors="raise",
     )
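The overlap handling in `remove_duplicates` comes down to the following pattern, shown here as a standalone pandas sketch on toy values (the real function operates on owid `Table`s and concatenates with `pr.concat`):

```python
import pandas as pd

# Both sources report Congo for the same date; Nigeria appears only once.
tb = pd.DataFrame(
    {
        "country": ["Congo", "Congo", "Nigeria"],
        "date": ["2024-08-04", "2024-08-04", "2024-08-04"],
        "total_cases": [100, 120, 40],
        "source": ["xmart", "shiny", "xmart"],
    }
)

# keep=False marks *every* member of a duplicated group, not just the extras.
dup = tb.duplicated(subset=["country", "date"], keep=False)
no_dup = tb[~dup]
preferred = tb[dup][tb[dup]["source"] == "shiny"]
tb = pd.concat([no_dup, preferred], ignore_index=True)
print(tb[["country", "total_cases"]].to_dict("records"))
# [{'country': 'Nigeria', 'total_cases': 40}, {'country': 'Congo', 'total_cases': 120}]
```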
diff --git a/etl/steps/data/garden/who/latest/monkeypox/monkeypox.meta.yml b/etl/steps/data/garden/who/latest/monkeypox/monkeypox.meta.yml
index acfd2b2ac63..abf1fd6c69b 100644
--- a/etl/steps/data/garden/who/latest/monkeypox/monkeypox.meta.yml
+++ b/etl/steps/data/garden/who/latest/monkeypox/monkeypox.meta.yml
@@ -78,6 +78,16 @@ tables:
         display:
           numDecimalPlaces: 3
           entityAnnotationsMap: "Democratic Republic of Congo: Includes only confirmed deaths as reported by WHO"
+      total_suspected_cases:
+        title: Total suspected cases
+        unit: cases
+        display:
+          numDecimalPlaces: 0
+      total_suspected_deaths:
+        title: Total suspected deaths
+        unit: deaths
+        display:
+          numDecimalPlaces: 0
       iso_code:
         title: ISO code
         unit: ''
diff --git a/etl/steps/data/meadow/who/latest/monkeypox_shiny.py b/etl/steps/data/meadow/who/latest/monkeypox_shiny.py
new file mode 100644
index 00000000000..af2e360c5d9
--- /dev/null
+++ b/etl/steps/data/meadow/who/latest/monkeypox_shiny.py
@@ -0,0 +1,33 @@
+"""Load a snapshot and create a meadow dataset."""
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Retrieve snapshot.
+    snap = paths.load_snapshot("monkeypox_shiny.csv")
+
+    # Load data from snapshot.
+    tb = snap.read()
+    # Drop exact duplicate rows (Morocco appears twice for some weeks in September 2022).
+    tb = tb.drop_duplicates()
+    #
+    # Process data.
+    #
+    # Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
+    tb = tb.format(["country", "week_end_date"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset with the same metadata as the snapshot.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
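A quick illustration of what that `drop_duplicates` call does and does not catch (toy values): only rows that are identical across every column are removed, so a conflicting duplicate with the same country and week but different counts would survive and leave the key non-unique.

```python
import pandas as pd

# Mimics the verbatim duplicates the Shiny export contained for Morocco
# in September 2022.
tb = pd.DataFrame(
    {
        "country": ["Morocco", "Morocco", "Nigeria"],
        "week_end_date": ["2022-09-11", "2022-09-11", "2022-09-11"],
        "new_confirmed_cases": [3, 3, 5],
    }
)

tb = tb.drop_duplicates()  # full-row comparison: one Morocco row is dropped
print(len(tb))  # 2
```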
diff --git a/snapshots/who/latest/monkeypox_shiny.csv.dvc b/snapshots/who/latest/monkeypox_shiny.csv.dvc
new file mode 100644
index 00000000000..fc21fd86fe4
--- /dev/null
+++ b/snapshots/who/latest/monkeypox_shiny.csv.dvc
@@ -0,0 +1,29 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    producer: World Health Organization
+    title: Mpox confirmed cases and deaths
+    description: |-
+      Data on mpox has been collated by the World Health Organization since 2022, and is updated as new information is reported.
+
+      We fetch the latest version of the WHO data every hour, keep records up to the previous day, apply some transformations (7-day averages, per-capita adjustments, etc.), and produce a transformed version of the data, available on GitHub. This transformed data powers our Mpox Data Explorer on Our World in Data.
+
+    # Citation
+    citation_full: |-
+      Mpox confirmed cases and deaths. World Health Organization; 2024.
+    attribution_short: WHO
+
+    # Files
+    url_main: https://worldhealthorg.shinyapps.io/mpx_global/
+    date_accessed: 2024-08-07
+    date_published: "2024-08-07"
+
+    # License
+    license:
+      name: CC BY 4.0
+
+outs:
+  - md5: e268d82e35fed99e41a75711bc26232f
+    size: 69014
+    path: monkeypox_shiny.csv
diff --git a/snapshots/who/latest/monkeypox_shiny.py b/snapshots/who/latest/monkeypox_shiny.py
new file mode 100644
index 00000000000..09d986f649c
--- /dev/null
+++ b/snapshots/who/latest/monkeypox_shiny.py
@@ -0,0 +1,60 @@
+"""Script to create a snapshot of dataset."""
+
+import base64
+from io import StringIO
+from pathlib import Path
+
+import click
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+def main(upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"who/{SNAPSHOT_VERSION}/monkeypox_shiny.csv")
+
+    df = get_shiny_data()
+
+    # Add file to DVC and upload to S3.
+    snap.create_snapshot(data=df, upload=upload)
+
+
+def get_shiny_data() -> pd.DataFrame:
+    # URL of the WHO mpox Shiny dashboard.
+    url = "https://worldhealthorg.shinyapps.io/mpx_global/#26_Case_definitions"
+
+    # Fetch the page content.
+    response = requests.get(url)
+    if response.status_code != 200:
+        raise RuntimeError(f"Failed to retrieve the webpage. Status code: {response.status_code}")
+
+    # Parse the HTML content with BeautifulSoup.
+    soup = BeautifulSoup(response.content, "html.parser")
+
+    # Find the download button. NB: this matches the first button with this class;
+    # a more robust selector should ensure we only ever pick the intended download.
+    button = soup.find("button", class_="btn btn-primary")
+    if button is None:
+        raise RuntimeError("Could not find the download button on the page.")
+
+    # The button embeds the CSV as a base64 data URI in its 'onclick' attribute.
+    onclick_value = button.get("onclick")
+    if "data:text/csv;base64," not in onclick_value:
+        raise RuntimeError("Download button does not contain a base64-encoded CSV.")
+
+    base64_data = onclick_value.split("data:text/csv;base64,")[1].strip("')")
+    base64_data = base64_data.split(")")[0]
+    decoded_csv = base64.b64decode(base64_data).decode("utf-8")
+    return pd.read_csv(StringIO(decoded_csv))
+
+
+if __name__ == "__main__":
+    main()
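For reference, here is the decode step in isolation. The `onclick` payload below is hypothetical (a real button wraps the data URI in a JavaScript call), but the string handling matches the script above:

```python
import base64
from io import StringIO

import pandas as pd

# Hypothetical onclick attribute embedding a tiny CSV as a base64 data URI.
onclick = "saveAs('data:text/csv;base64,Y291bnRyeSx2YWx1ZQpDb25nbywxMg==')"

# Take everything after the data-URI prefix, up to the closing quote.
payload = onclick.split("data:text/csv;base64,")[1].split("'")[0]
df = pd.read_csv(StringIO(base64.b64decode(payload).decode("utf-8")))
print(df)
#   country  value
# 0   Congo     12
```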