Fix datasette example and improve logging
ekzhang committed Oct 4, 2023
1 parent 0cc0255 commit b107582
Showing 1 changed file with 19 additions and 14 deletions.
10_integrations/covid_datasette.py — 33 changes: 19 additions & 14 deletions
@@ -24,8 +24,10 @@
 import asyncio
 import pathlib
 import shutil
+import subprocess
 import tempfile
 from datetime import datetime, timedelta
+from urllib.request import urlretrieve
 
 from modal import Image, NetworkFileSystem, Period, Stub, asgi_app
 
@@ -35,10 +37,9 @@
     .pip_install(
         "datasette~=0.63.2",
         "flufl.lock",
-        "GitPython",
         "sqlite-utils",
     )
-    .apt_install("git")
+    .apt_install("unzip")
 )
 
 # ## Persistent dataset storage
@@ -51,7 +52,7 @@
 
 CACHE_DIR = "/cache"
 LOCK_FILE = str(pathlib.Path(CACHE_DIR, "lock-reports"))
-REPO_DIR = pathlib.Path(CACHE_DIR, "COVID-19")
+REPORTS_DIR = pathlib.Path(CACHE_DIR, "COVID-19")
 DB_PATH = pathlib.Path(CACHE_DIR, "covid-19.db")
 
 # ## Getting a dataset
@@ -69,22 +70,28 @@
     retries=2,
 )
 def download_dataset(cache=True):
-    import git
     from flufl.lock import Lock
 
-    if REPO_DIR.exists() and cache:
+    if REPORTS_DIR.exists() and cache:
         print(f"Dataset already present and {cache=}. Skipping download.")
         return
-    elif REPO_DIR.exists():
+    elif REPORTS_DIR.exists():
         print(
             "Acquiring lock before deleting dataset, which may be in use by other runs."
         )
         with Lock(LOCK_FILE, default_timeout=timedelta(hours=1)):
-            shutil.rmtree(REPO_DIR)
+            shutil.rmtree(REPORTS_DIR)
             print("Cleaned dataset before re-downloading.")
 
-    git_url = "https://github.com/CSSEGISandData/COVID-19"
-    git.Repo.clone_from(git_url, REPO_DIR, depth=1)
+    print("Downloading dataset...")
+    urlretrieve("https://github.com/CSSEGISandData/COVID-19/archive/refs/heads/master.zip", "/tmp/covid-19.zip")
+
+    print("Unpacking archive")
+    prefix = "COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports"
+    subprocess.run(f"unzip /tmp/covid-19.zip {prefix}/* -d {REPORTS_DIR}", shell=True)
+    subprocess.run(f"mv {REPORTS_DIR / prefix}/* {REPORTS_DIR}", shell=True)
+
+    print("Finished downloading dataset.")
 
 
 # ## Data munging
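
Side note, not part of this commit: the unpacking step could avoid the `unzip` apt dependency entirely by using Python's standard-library `zipfile`. A minimal sketch, reusing the `prefix` value and `REPORTS_DIR` name from the diff above; `extract_reports` is a hypothetical helper, not code from the repository:

import pathlib
import shutil
import zipfile

def extract_reports(archive: str, reports_dir: pathlib.Path) -> None:
    # Same subdirectory the commit passes to `unzip`.
    prefix = "COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports"
    reports_dir.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(archive) as zf:
        for member in zf.namelist():
            if member.startswith(prefix) and member.endswith(".csv"):
                # Write each CSV directly into reports_dir, flattening the
                # archive layout and replacing the separate `mv` step.
                target = reports_dir / pathlib.Path(member).name
                with zf.open(member) as src, open(target, "wb") as dst:
                    shutil.copyfileobj(src, dst)

This also sidesteps `shell=True` string interpolation, at the cost of slightly more code.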
@@ -96,11 +103,9 @@ def download_dataset(cache=True):
 
 
 def load_daily_reports():
-    jhu_csse_base = REPO_DIR
-    reports_path = (
-        jhu_csse_base / "csse_covid_19_data" / "csse_covid_19_daily_reports"
-    )
-    daily_reports = list(reports_path.glob("*.csv"))
+    daily_reports = list(REPORTS_DIR.glob("*.csv"))
+    if not daily_reports:
+        raise RuntimeError(f"Could not find any daily reports in {REPORTS_DIR}.")
     for filepath in daily_reports:
         yield from load_report(filepath)
 
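For context, code outside this hunk loads these rows into the SQLite database with `sqlite-utils`, which the image pins above. A rough sketch of that pattern — assumed, not the example's actual database-prep code; the table name and batch size here are illustrative:

import sqlite_utils

def create_report_db(records):
    # DB_PATH is defined earlier in the file as /cache/covid-19.db.
    db = sqlite_utils.Database(str(DB_PATH))
    # insert_all consumes the generator in batches and infers the
    # table schema from the first batch of records.
    db["daily_reports"].insert_all(records, batch_size=100_000)

# Usage: create_report_db(load_daily_reports())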
