📊 mpox: get data from the WHO Shiny App #3308

Closed
wants to merge 15 commits
6 changes: 5 additions & 1 deletion dag/health.yml
@@ -713,11 +713,12 @@ steps:
data://grapher/who/2024-08-06/mortality_database_cancer_most_common:
- data://garden/who/2024-08-06/mortality_database_cancer_most_common


# Mpox - WHO
data://meadow/who/latest/monkeypox:
- snapshot://who/latest/monkeypox.csv
data://garden/who/latest/monkeypox:
- data://meadow/who/latest/monkeypox
- data://meadow/who/latest/monkeypox_shiny
- data://garden/demography/2023-03-31/population
- data://garden/regions/2023-01-01/regions
- data://garden/health/latest/global_health_mpox
@@ -727,6 +728,9 @@ steps:
- data://garden/who/latest/monkeypox
export://github/who/latest/monkeypox:
- data://garden/who/latest/monkeypox
# Mpox Shiny app
data://meadow/who/latest/monkeypox_shiny:
- snapshot://who/latest/monkeypox_shiny.csv
# Mpox - Global.health
data://meadow/health/latest/global_health_mpox:
- snapshot://health/latest/global_health_mpox.csv
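For reference, the wiring above adds one snapshot-to-meadow dependency and one meadow-to-garden dependency for the Shiny data. A hypothetical sanity check against dag/health.yml (step URIs taken from the diff above; a sketch, not part of the ETL):

import yaml

with open("dag/health.yml") as f:
    steps = yaml.safe_load(f)["steps"]

# The garden step should now also depend on the Shiny meadow step...
assert "data://meadow/who/latest/monkeypox_shiny" in steps["data://garden/who/latest/monkeypox"]
# ...and the Shiny meadow step on the new snapshot.
assert "snapshot://who/latest/monkeypox_shiny.csv" in steps["data://meadow/who/latest/monkeypox_shiny"]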
3 changes: 2 additions & 1 deletion etl/steps/data/garden/who/latest/monkeypox/monkeypox.countries.json
@@ -16,5 +16,6 @@
"Uganda": "Uganda",
"Burundi ": "Burundi",
"Republic of the Congo": "Congo",
"Guinea": "Guinea"
"Morocco": "Morocco",
"Guinea ": "Guinea"
}
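The mapping above is what geo.harmonize_countries applies in the garden step: raw source spellings (including variants with trailing whitespace such as "Guinea " and "Burundi ") are keys, and harmonized OWID names are values. A minimal sketch of the effect, using plain pandas and the filename mentioned in the warning inside __init__.py (illustrative only; the step uses geo.harmonize_countries rather than a bare .map()):

import json

import pandas as pd

with open("monkeypox.countries.json") as f:
    mapping = json.load(f)

df = pd.DataFrame({"country": ["Guinea ", "Burundi ", "Republic of the Congo"]})
df["country"] = df["country"].map(mapping)  # -> Guinea, Burundi, Congo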
84 changes: 67 additions & 17 deletions etl/steps/data/garden/who/latest/monkeypox/__init__.py
@@ -1,6 +1,7 @@
"""Load a meadow dataset and create a garden dataset."""

import datetime
from typing import List

import owid.catalog.processing as pr
import pandas as pd
@@ -31,20 +32,33 @@ def run(dest_dir: str) -> None:
#
# Load meadow dataset.
ds_meadow = paths.load_dataset("monkeypox")
ds_suspected = paths.load_dataset("global_health_mpox")
ds_meadow_shiny = paths.load_dataset("monkeypox_shiny")
# ds_suspected = paths.load_dataset("global_health_mpox")
# Read table from meadow dataset.
tb = ds_meadow["monkeypox"].reset_index()
tb_suspected = ds_suspected["global_health_mpox"].reset_index()
cols = ["country", "date", "suspected_cases_cumulative"]
tb_suspected = tb_suspected[cols]
assert tb_suspected.shape[1] == len(cols)
tb["source"] = "xmart"
tb_africa = ds_meadow_shiny["monkeypox_shiny"].reset_index()
tb_africa["source"] = "shiny"
rename_dict = {
"week_end_date": "date",
"total_confirmed_cases": "total_conf_cases",
"total_deaths": "total_conf_deaths",
"new_confirmed_cases": "new_conf_cases",
"new_deaths": "new_conf_deaths",
}
tb_africa = tb_africa.rename(columns=rename_dict, errors="raise")
# tb_suspected = ds_suspected["global_health_mpox"].reset_index()
# cols = ["country", "date", "suspected_cases_cumulative"]
# tb_suspected = tb_suspected[cols]
# assert tb_suspected.shape[1] == len(cols)
origins = tb["total_conf_cases"].metadata.origins
#
# Process data.
#
tb_orig = tb.copy()
tb = geo.harmonize_countries(
df=tb,
tb_combine = pr.concat([tb, tb_africa], ignore_index=True)
tb_combine = geo.harmonize_countries(
df=tb_combine,
countries_file=paths.country_mapping_path,
make_missing_countries_nan=True,
)
@@ -55,6 +69,8 @@ def run(dest_dir: str) -> None:
log.warning(f"Missing countries in monkeypox.countries.json: {missing_countries}")
tb.country = tb.country.astype(str).fillna(tb_orig.country)

# Removing duplicates and preferring shiny data
tb = remove_duplicates(tb=tb_combine, preferred_source="shiny", dimensions=["country", "date"])
tb = (
tb.pipe(clean_columns)
.pipe(clean_date)
@@ -67,37 +83,71 @@ def run(dest_dir: str) -> None:
.pipe(filter_dates)
)

tb_both = pr.merge(tb, tb_suspected, on=["country", "date"], how="outer")
# tb_both = pr.merge(tb, tb_suspected, on=["country", "date"], how="outer")

# For death variables, add an annotation noting that the WHO data include _only_ confirmed deaths
country_mask = tb_both["country"] == "Democratic Republic of Congo"
tb_both["annotation"] = ""
tb_both.loc[country_mask, "annotation"] = (
tb_both.loc[country_mask, "annotation"] + "Includes only confirmed deaths as reported by WHO"
country_mask = tb["country"] == "Democratic Republic of Congo"
tb["annotation"] = ""
tb.loc[country_mask, "annotation"] = (
tb.loc[country_mask, "annotation"] + "Includes only confirmed deaths as reported by WHO"
)
tb_both["annotation"].metadata.origins = origins
tb_both = tb_both.format(["country", "date"])
tb["annotation"].metadata.origins = origins
tb = tb.format(["country", "date"])

#
# Save outputs.
#
# Create a new garden dataset with the same metadata as the meadow dataset.
ds_garden = create_dataset(
dest_dir, tables=[tb_both], check_variables_metadata=True, default_metadata=ds_meadow.metadata
dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
)

# Save changes in the new garden dataset.
ds_garden.save()


def remove_duplicates(tb: Table, preferred_source: str, dimensions: List[str]) -> Table:
"""
Remove rows that are duplicated across the given dimensions, keeping the row from the preferred source.
"""
assert any(tb["source"] == preferred_source)
tb = tb.copy(deep=True)
duplicate_rows = tb.duplicated(subset=dimensions, keep=False)

tb_no_duplicates = tb[~duplicate_rows]

tb_duplicates = tb[duplicate_rows]

tb_duplicates_removed = tb_duplicates[tb_duplicates["source"] == preferred_source]

tb = pr.concat([tb_no_duplicates, tb_duplicates_removed], ignore_index=True)

assert len(tb[tb.duplicated(subset=dimensions, keep=False)]) == 0, "Duplicates still in table!"

return tb


def clean_columns(tb: Table) -> Table:
return tb.loc[:, ["country", "iso3", "date", "total_conf_cases", "total_conf_deaths"]].rename(
return tb.loc[
:,
[
"country",
"iso3",
"date",
"total_conf_cases",
"total_conf_deaths",
"total_suspected_cases",
"total_suspected_deaths",
],
].rename(
columns={
"date": "date",
"total_conf_cases": "total_cases",
"total_conf_deaths": "total_deaths",
"iso3": "iso_code",
}
},
errors="raise",
)


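A note on the remove_duplicates helper added above: where both the xmart feed and the Shiny export report the same (country, date), only the row from the preferred source survives. A minimal sketch of the same logic with a toy DataFrame (plain pandas here; the step itself operates on owid Tables):

import pandas as pd

tb = pd.DataFrame(
    {
        "country": ["Nigeria", "Nigeria", "Ghana"],
        "date": ["2024-08-04", "2024-08-04", "2024-08-04"],
        "total_conf_cases": [40, 39, 12],
        "source": ["shiny", "xmart", "xmart"],
    }
)
# Mirror the helper: keep rows that are not duplicated, plus the preferred-source rows among duplicates.
dups = tb.duplicated(subset=["country", "date"], keep=False)
deduped = pd.concat([tb[~dups], tb[dups & (tb["source"] == "shiny")]], ignore_index=True)
assert not deduped.duplicated(subset=["country", "date"]).any()  # Nigeria kept once, from "shiny"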
10 changes: 10 additions & 0 deletions etl/steps/data/garden/who/latest/monkeypox/monkeypox.meta.yml
@@ -78,6 +78,16 @@ tables:
display:
numDecimalPlaces: 3
entityAnnotationsMap: "Democratic Republic of Congo: Includes only confirmed deaths as reported by WHO"
total_suspected_cases:
title: Total suspected cases
unit: cases
display:
numDecimalPlaces: 0
total_suspected_deaths:
title: Total suspected deaths
unit: deaths
display:
numDecimalPlaces: 0
iso_code:
title: ISO code
unit: ''
33 changes: 33 additions & 0 deletions etl/steps/data/meadow/who/latest/monkeypox_shiny.py
@@ -0,0 +1,33 @@
"""Load a snapshot and create a meadow dataset."""

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
#
# Load inputs.
#
# Retrieve snapshot.
snap = paths.load_snapshot("monkeypox_shiny.csv")

# Load data from snapshot.
tb = snap.read()
# Some duplicate rows for Morocco in Sept 2022
tb = tb.drop_duplicates()
#
# Process data.
#
# Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
tb = tb.format(["country", "week_end_date"])

#
# Save outputs.
#
# Create a new meadow dataset with the same metadata as the snapshot.
ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)

# Save changes in the new meadow dataset.
ds_meadow.save()
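The drop_duplicates() call above handles exact repeats in the Shiny export (the comment mentions Morocco in September 2022). A small illustration, with made-up values, of why this must happen before format() sets the (country, week_end_date) index:

import pandas as pd

tb = pd.DataFrame(
    {
        "country": ["Morocco", "Morocco", "Nigeria"],
        "week_end_date": ["2022-09-25", "2022-09-25", "2022-09-25"],
        "total_confirmed_cases": [10, 10, 3],
    }
)
tb = tb.drop_duplicates()  # removes the exact-duplicate Morocco row
# (country, week_end_date) now uniquely identifies each row, so it can be used as the index.
assert not tb.duplicated(subset=["country", "week_end_date"]).any()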
29 changes: 29 additions & 0 deletions snapshots/who/latest/monkeypox_shiny.csv.dvc
@@ -0,0 +1,29 @@
# Learn more at:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
meta:
origin:
producer: World Health Organization
title: Mpox confirmed cases and deaths
description: |-
Data on mpox has been collated by the World Health Organization since 2022 and is updated as new information is reported.

We fetch the latest version of the WHO data every hour, keep records up to the previous day, apply some transformations (7-day averages, per-capita adjustments, etc.), and produce a transformed version of the data, available on GitHub. This transformed data powers our Mpox Data Explorer on Our World in Data.

# Citation
citation_full: |-
Mpox confirmed cases and deaths. World Health Organization; 2024.
attribution_short: WHO

# Files
url_main: https://worldhealthorg.shinyapps.io/mpx_global/
date_accessed: 2024-08-07
date_published: "2024-08-07"

# License
license:
name: CC BY 4.0

outs:
- md5: b351cca2282c748a8378fd2351ed58a2
size: 64640
path: monkeypox_shiny.csv
61 changes: 61 additions & 0 deletions snapshots/who/latest/monkeypox_shiny.py
@@ -0,0 +1,61 @@
"""Script to create a snapshot of dataset."""

import base64
from io import StringIO
from pathlib import Path

import click
import pandas as pd
import requests
from bs4 import BeautifulSoup

from etl.snapshot import Snapshot

# Version for current snapshot dataset.
SNAPSHOT_VERSION = Path(__file__).parent.name


@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
def main(upload: bool) -> None:
# Create a new snapshot.
snap = Snapshot(f"who/{SNAPSHOT_VERSION}/monkeypox_shiny.csv")

df = get_shiny_data()

# Download data from source, add file to DVC and upload to S3.
snap.create_snapshot(data=df, upload=upload)


def get_shiny_data():
# URL of the WHO mpox Shiny app
url = "https://worldhealthorg.shinyapps.io/mpx_global/#26_Case_definitions"

# Fetch the page content
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Find the download button by its CSS class
button = soup.find("button", class_="btn btn-primary")

# Extract the value of the 'onclick' attribute
if button:
onclick_value = button.get("onclick")
# If the attribute embeds a base64-encoded CSV ('data:text/csv;base64,'), extract and decode it.
# TODO: make this more targeted so that only the intended download button can match.
if "data:text/csv;base64," in onclick_value:
base64_data = onclick_value.split("data:text/csv;base64,")[1].strip("')")
base64_data = base64_data.split(")")[0]
decoded_csv = base64.b64decode(base64_data).decode("utf-8")
csv_data = StringIO(decoded_csv)
df = pd.read_csv(csv_data)
else:
# Raise instead of printing: otherwise `df` would be undefined at the return below.
raise RuntimeError(f"Failed to retrieve the webpage. Status code: {response.status_code}")
return df


if __name__ == "__main__":
main()
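As the inline comment in get_shiny_data notes, the onclick parsing could be made more targeted. A hedged sketch of a stricter variant (regex instead of string splitting, and explicit failures instead of falling through with an undefined df); the button class is the same assumption as in the script above:

import base64
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup


def get_shiny_data_strict(url: str) -> pd.DataFrame:
    response = requests.get(url)
    response.raise_for_status()  # fail loudly on a non-200 response
    soup = BeautifulSoup(response.content, "html.parser")
    for button in soup.find_all("button", class_="btn btn-primary"):
        onclick = button.get("onclick") or ""
        # Look for a CSV embedded as a base64 data URI in the onclick attribute.
        match = re.search(r"data:text/csv;base64,([A-Za-z0-9+/=]+)", onclick)
        if match:
            decoded_csv = base64.b64decode(match.group(1)).decode("utf-8")
            return pd.read_csv(StringIO(decoded_csv))
    raise ValueError("No download button with an embedded base64 CSV was found.")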