📊 mpox: get data from the WHO Shiny App #3308

Closed
wants to merge 15 commits
6 changes: 5 additions & 1 deletion dag/health.yml
@@ -713,11 +713,12 @@ steps:
data://grapher/who/2024-08-06/mortality_database_cancer_most_common:
- data://garden/who/2024-08-06/mortality_database_cancer_most_common


# Mpox - WHO
data://meadow/who/latest/monkeypox:
- snapshot://who/latest/monkeypox.csv
data://garden/who/latest/monkeypox:
- data://meadow/who/latest/monkeypox
- data://meadow/who/latest/monkeypox_shiny
- data://garden/demography/2023-03-31/population
- data://garden/regions/2023-01-01/regions
- data://garden/health/latest/global_health_mpox
@@ -727,6 +728,9 @@ steps:
- data://garden/who/latest/monkeypox
export://github/who/latest/monkeypox:
- data://garden/who/latest/monkeypox
# Mpox Shiny app
data://meadow/who/latest/monkeypox_shiny:
- snapshot://who/latest/monkeypox_shiny.csv
# Mpox - Global.health
data://meadow/health/latest/global_health_mpox:
- snapshot://health/latest/global_health_mpox.csv
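For reference, the wiring above adds one snapshot-to-meadow dependency and one meadow-to-garden dependency for the Shiny data. A hypothetical sanity check against dag/health.yml (step URIs taken from the diff above; a sketch, not part of the ETL):

import yaml

with open("dag/health.yml") as f:
    steps = yaml.safe_load(f)["steps"]

# The garden step should now also depend on the Shiny meadow step...
assert "data://meadow/who/latest/monkeypox_shiny" in steps["data://garden/who/latest/monkeypox"]
# ...and the Shiny meadow step on the new snapshot.
assert "snapshot://who/latest/monkeypox_shiny.csv" in steps["data://meadow/who/latest/monkeypox_shiny"]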
3 changes: 2 additions & 1 deletion etl/steps/data/garden/who/latest/monkeypox/monkeypox.countries.json
@@ -16,5 +16,6 @@
"Uganda": "Uganda",
"Burundi ": "Burundi",
"Republic of the Congo": "Congo",
"Guinea": "Guinea"
"Morocco": "Morocco",
"Guinea ": "Guinea"
}
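The mapping above is what geo.harmonize_countries applies in the garden step: raw source spellings (including variants with trailing whitespace such as "Guinea " and "Burundi ") are keys, and harmonized OWID names are values. A minimal sketch of the effect, using plain pandas and the filename mentioned in the warning inside __init__.py (illustrative only; the step uses geo.harmonize_countries rather than a bare .map()):

import json

import pandas as pd

with open("monkeypox.countries.json") as f:
    mapping = json.load(f)

df = pd.DataFrame({"country": ["Guinea ", "Burundi ", "Republic of the Congo"]})
df["country"] = df["country"].map(mapping)  # -> Guinea, Burundi, Congo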
84 changes: 67 additions & 17 deletions etl/steps/data/garden/who/latest/monkeypox/__init__.py
@@ -1,6 +1,7 @@
"""Load a meadow dataset and create a garden dataset."""

import datetime
from typing import List

import owid.catalog.processing as pr
import pandas as pd
@@ -31,20 +32,33 @@ def run(dest_dir: str) -> None:
#
# Load meadow dataset.
ds_meadow = paths.load_dataset("monkeypox")
ds_suspected = paths.load_dataset("global_health_mpox")
ds_meadow_shiny = paths.load_dataset("monkeypox_shiny")
# ds_suspected = paths.load_dataset("global_health_mpox")
# Read table from meadow dataset.
tb = ds_meadow["monkeypox"].reset_index()
tb_suspected = ds_suspected["global_health_mpox"].reset_index()
cols = ["country", "date", "suspected_cases_cumulative"]
tb_suspected = tb_suspected[cols]
assert tb_suspected.shape[1] == len(cols)
tb["source"] = "xmart"
tb_africa = ds_meadow_shiny["monkeypox_shiny"].reset_index()
tb_africa["source"] = "shiny"
rename_dict = {
"week_end_date": "date",
"total_confirmed_cases": "total_conf_cases",
"total_deaths": "total_conf_deaths",
"new_confirmed_cases": "new_conf_cases",
"new_deaths": "new_conf_deaths",
}
tb_africa = tb_africa.rename(columns=rename_dict, errors="raise")
# tb_suspected = ds_suspected["global_health_mpox"].reset_index()
# cols = ["country", "date", "suspected_cases_cumulative"]
# tb_suspected = tb_suspected[cols]
# assert tb_suspected.shape[1] == len(cols)
origins = tb["total_conf_cases"].metadata.origins
#
# Process data.
#
tb_orig = tb.copy()
tb = geo.harmonize_countries(
df=tb,
tb_combine = pr.concat([tb, tb_africa], ignore_index=True)
tb_combine = geo.harmonize_countries(
df=tb_combine,
countries_file=paths.country_mapping_path,
make_missing_countries_nan=True,
)
@@ -55,6 +69,8 @@ def run(dest_dir: str) -> None:
log.warning(f"Missing countries in monkeypox.countries.json: {missing_countries}")
tb.country = tb.country.astype(str).fillna(tb_orig.country)

# Removing duplicates and preferring shiny data
tb = remove_duplicates(tb=tb_combine, preferred_source="shiny", dimensions=["country", "date"])
tb = (
tb.pipe(clean_columns)
.pipe(clean_date)
@@ -67,37 +83,71 @@ def run(dest_dir: str) -> None:
.pipe(filter_dates)
)

tb_both = pr.merge(tb, tb_suspected, on=["country", "date"], how="outer")
# tb_both = pr.merge(tb, tb_suspected, on=["country", "date"], how="outer")

# For death variables, add an annotation noting that the WHO data include _only_ confirmed deaths
country_mask = tb_both["country"] == "Democratic Republic of Congo"
tb_both["annotation"] = ""
tb_both.loc[country_mask, "annotation"] = (
tb_both.loc[country_mask, "annotation"] + "Includes only confirmed deaths as reported by WHO"
country_mask = tb["country"] == "Democratic Republic of Congo"
tb["annotation"] = ""
tb.loc[country_mask, "annotation"] = (
tb.loc[country_mask, "annotation"] + "Includes only confirmed deaths as reported by WHO"
)
tb_both["annotation"].metadata.origins = origins
tb_both = tb_both.format(["country", "date"])
tb["annotation"].metadata.origins = origins
tb = tb.format(["country", "date"])

#
# Save outputs.
#
# Create a new garden dataset with the same metadata as the meadow dataset.
ds_garden = create_dataset(
dest_dir, tables=[tb_both], check_variables_metadata=True, default_metadata=ds_meadow.metadata
dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
)

# Save changes in the new garden dataset.
ds_garden.save()


def remove_duplicates(tb: Table, preferred_source: str, dimensions: List[str]) -> Table:
"""
Remove rows that are duplicated across the given dimensions, keeping the row from the preferred source.
"""
assert any(tb["source"] == preferred_source)
tb = tb.copy(deep=True)
duplicate_rows = tb.duplicated(subset=dimensions, keep=False)

tb_no_duplicates = tb[~duplicate_rows]

tb_duplicates = tb[duplicate_rows]

tb_duplicates_removed = tb_duplicates[tb_duplicates["source"] == preferred_source]

tb = pr.concat([tb_no_duplicates, tb_duplicates_removed], ignore_index=True)

assert len(tb[tb.duplicated(subset=dimensions, keep=False)]) == 0, "Duplicates still in table!"

return tb


def clean_columns(tb: Table) -> Table:
return tb.loc[:, ["country", "iso3", "date", "total_conf_cases", "total_conf_deaths"]].rename(
return tb.loc[
:,
[
"country",
"iso3",
"date",
"total_conf_cases",
"total_conf_deaths",
"total_suspected_cases",
"total_suspected_deaths",
],
].rename(
columns={
"date": "date",
"total_conf_cases": "total_cases",
"total_conf_deaths": "total_deaths",
"iso3": "iso_code",
}
},
errors="raise",
)


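A note on the remove_duplicates helper added above: where both the xmart feed and the Shiny export report the same (country, date), only the row from the preferred source survives. A minimal sketch of the same logic with a toy DataFrame (plain pandas here; the step itself operates on owid Tables):

import pandas as pd

tb = pd.DataFrame(
    {
        "country": ["Nigeria", "Nigeria", "Ghana"],
        "date": ["2024-08-04", "2024-08-04", "2024-08-04"],
        "total_conf_cases": [40, 39, 12],
        "source": ["shiny", "xmart", "xmart"],
    }
)
# Mirror the helper: keep rows that are not duplicated, plus the preferred-source rows among duplicates.
dups = tb.duplicated(subset=["country", "date"], keep=False)
deduped = pd.concat([tb[~dups], tb[dups & (tb["source"] == "shiny")]], ignore_index=True)
assert not deduped.duplicated(subset=["country", "date"]).any()  # Nigeria kept once, from "shiny"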
10 changes: 10 additions & 0 deletions etl/steps/data/garden/who/latest/monkeypox/monkeypox.meta.yml
@@ -78,6 +78,16 @@ tables:
display:
numDecimalPlaces: 3
entityAnnotationsMap: "Democratic Republic of Congo: Includes only confirmed deaths as reported by WHO"
total_suspected_cases:
title: Total suspected cases
unit: cases
display:
numDecimalPlaces: 0
total_suspected_deaths:
title: Total suspected deaths
unit: deaths
display:
numDecimalPlaces: 0
iso_code:
title: ISO code
unit: ''
33 changes: 33 additions & 0 deletions etl/steps/data/meadow/who/latest/monkeypox_shiny.py
@@ -0,0 +1,33 @@
"""Load a snapshot and create a meadow dataset."""

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
#
# Load inputs.
#
# Retrieve snapshot.
snap = paths.load_snapshot("monkeypox_shiny.csv")

# Load data from snapshot.
tb = snap.read()
# Some duplicate rows for Morocco in Sept 2022
tb = tb.drop_duplicates()
#
# Process data.
#
# Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
tb = tb.format(["country", "week_end_date"])

#
# Save outputs.
#
# Create a new meadow dataset with the same metadata as the snapshot.
ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)

# Save changes in the new meadow dataset.
ds_meadow.save()
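The drop_duplicates() call above handles exact repeats in the Shiny export (the comment mentions Morocco in September 2022). A small illustration, with made-up values, of why this must happen before format() sets the (country, week_end_date) index:

import pandas as pd

tb = pd.DataFrame(
    {
        "country": ["Morocco", "Morocco", "Nigeria"],
        "week_end_date": ["2022-09-25", "2022-09-25", "2022-09-25"],
        "total_confirmed_cases": [10, 10, 3],
    }
)
tb = tb.drop_duplicates()  # removes the exact-duplicate Morocco row
# (country, week_end_date) now uniquely identifies each row, so it can be used as the index.
assert not tb.duplicated(subset=["country", "week_end_date"]).any()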
29 changes: 29 additions & 0 deletions snapshots/who/latest/monkeypox_shiny.csv.dvc
@@ -0,0 +1,29 @@
# Learn more at:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
meta:
origin:
producer: World Health Organization
title: Mpox confirmed cases and deaths
description: |-
Data on mpox has been collated by the World Health Organization since 2022 and is updated as new information is reported.

We fetch the latest version of the WHO data every hour, keep records up to the previous day, apply some transformations (7-day averages, per-capita adjustments, etc.), and produce a transformed version of the data, available on GitHub. This transformed data powers our Mpox Data Explorer on Our World in Data.

# Citation
citation_full: |-
Mpox confirmed cases and deaths. World Health Organization; 2024.
attribution_short: WHO

# Files
url_main: https://worldhealthorg.shinyapps.io/mpx_global/
date_accessed: 2024-08-07
date_published: "2024-08-07"

# License
license:
name: CC BY 4.0

outs:
- md5: b351cca2282c748a8378fd2351ed58a2
size: 64640
path: monkeypox_shiny.csv
61 changes: 61 additions & 0 deletions snapshots/who/latest/monkeypox_shiny.py
@@ -0,0 +1,61 @@
"""Script to create a snapshot of dataset."""

import base64
from io import StringIO
from pathlib import Path

import click
import pandas as pd
import requests
from bs4 import BeautifulSoup

from etl.snapshot import Snapshot

# Version for current snapshot dataset.
SNAPSHOT_VERSION = Path(__file__).parent.name


@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
def main(upload: bool) -> None:
# Create a new snapshot.
snap = Snapshot(f"who/{SNAPSHOT_VERSION}/monkeypox_shiny.csv")

df = get_shiny_data()

# Download data from source, add file to DVC and upload to S3.
snap.create_snapshot(data=df, upload=upload)


def get_shiny_data():
# URL of the WHO mpox Shiny app
url = "https://worldhealthorg.shinyapps.io/mpx_global/#26_Case_definitions"

# Fetch the page content
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Find the download button by its CSS class
button = soup.find("button", class_="btn btn-primary")

# Extract the value of the 'onclick' attribute
if button:
onclick_value = button.get("onclick")
# If the attribute embeds a base64-encoded CSV ('data:text/csv;base64,'), extract and decode it.
# TODO: make this more targeted so that only the intended download button can match.
if "data:text/csv;base64," in onclick_value:
base64_data = onclick_value.split("data:text/csv;base64,")[1].strip("')")
base64_data = base64_data.split(")")[0]
decoded_csv = base64.b64decode(base64_data).decode("utf-8")
csv_data = StringIO(decoded_csv)
df = pd.read_csv(csv_data)
else:
# Raise instead of printing: otherwise `df` would be undefined at the return below.
raise RuntimeError(f"Failed to retrieve the webpage. Status code: {response.status_code}")
return df


if __name__ == "__main__":
main()
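As the inline comment in get_shiny_data notes, the onclick parsing could be made more targeted. A hedged sketch of a stricter variant (regex instead of string splitting, and explicit failures instead of falling through with an undefined df); the button class is the same assumption as in the script above:

import base64
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup


def get_shiny_data_strict(url: str) -> pd.DataFrame:
    response = requests.get(url)
    response.raise_for_status()  # fail loudly on a non-200 response
    soup = BeautifulSoup(response.content, "html.parser")
    for button in soup.find_all("button", class_="btn btn-primary"):
        onclick = button.get("onclick") or ""
        # Look for a CSV embedded as a base64 data URI in the onclick attribute.
        match = re.search(r"data:text/csv;base64,([A-Za-z0-9+/=]+)", onclick)
        if match:
            decoded_csv = base64.b64decode(match.group(1)).decode("utf-8")
            return pd.read_csv(StringIO(decoded_csv))
    raise ValueError("No download button with an embedded base64 CSV was found.")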