Skip to content

Commit

Permalink
Add huggingface-based data storage (#43)
Browse files Browse the repository at this point in the history
* Add huggingface-based data storage
Fixes #42

* Update deps and add GH upload action to test

* Format

* Adjust download script

* Add logging

* Fix download script

* Add auto-calibrate
  • Loading branch information
nikhilwoodruff authored Nov 27, 2024
1 parent 89b035d commit a9437a9
Show file tree
Hide file tree
Showing 10 changed files with 96 additions and 42 deletions.
4 changes: 1 addition & 3 deletions .github/workflows/pull_request.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,9 @@ jobs:
- name: Download data inputs
run: make download
env:
POLICYENGINE_UK_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_UK_DATA_GITHUB_TOKEN }}
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
- name: Build datasets
run: make data
env:
LITE_MODE: true
- name: Run tests
run: pytest
- name: Test documentation builds
Expand Down
13 changes: 6 additions & 7 deletions .github/workflows/push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,24 +39,23 @@ jobs:
run: make install
- name: Download data inputs
run: make download
env:
POLICYENGINE_UK_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_UK_DATA_GITHUB_TOKEN }}
- name: Build datasets
run: make data
env:
LITE_MODE: true
- name: Run tests
run: pytest
- name: Upload data
run: make upload
env:
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
- name: Test documentation builds
run: make documentation
- name: Build Jupyter Book
run: make documentation
- name: Deploy documentation
uses: JamesIves/github-pages-deploy-action@releases/v4
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
BRANCH: gh-pages
FOLDER: docs/_build/html
branch: gh-pages
folder: docs/_build/html
publish-to-pypi:
name: Publish to PyPI
runs-on: ubuntu-latest
Expand Down
2 changes: 2 additions & 0 deletions docker/policyengine_uk_data.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Image whose build installs the project, builds the datasets and uploads
# them; every step runs at image build time through a Makefile target.
# NOTE(review): `latest` is a moving tag, so rebuilds may pick up a different
# Python version over time - confirm that is intended.
FROM python:latest
# Copy the entire build context into the image's working directory.
COPY . .
RUN make install
RUN make data
# NOTE(review): presumably needs Hugging Face credentials in the build
# environment; nothing in this file supplies them - confirm how they are passed.
RUN make upload
2 changes: 1 addition & 1 deletion policyengine_uk_data/datasets/frs/enhanced_frs.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ class EnhancedFRS_2022_23(EnhancedFRS):
input_frs = ExtendedFRS_2022_23
time_period = 2022
end_year = 2028
url = "release://PolicyEngine/ukda/1.9.0/enhanced_frs_2022_23.h5"
url = "hf://policyengine/policyengine-uk-data"


def reweight(
Expand Down
2 changes: 1 addition & 1 deletion policyengine_uk_data/datasets/frs/frs.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ class FRS_2022_23(FRS):
label = "FRS (2022-23)"
file_path = STORAGE_FOLDER / "frs_2022_23.h5"
time_period = 2022
url = "release://PolicyEngine/ukda/1.9.0/frs_2022_23.h5"
url = "hf://policyengine/policyengine-uk-data"


def add_id_variables(frs: h5py.File, person: DataFrame, household: DataFrame):
Expand Down
21 changes: 10 additions & 11 deletions policyengine_uk_data/storage/download_private_prerequisites.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from policyengine_uk_data.utils.github import download
from policyengine_uk_data.utils.huggingface import download, upload
from pathlib import Path
import zipfile


def extract_zipped_folder(folder):
folder = Path(folder)
with zipfile.ZipFile(folder, "r") as zip_ref:
zip_ref.extractall(folder.parent)
zip_ref.extractall(folder.parent / folder.stem)


FOLDER = Path(__file__).parent
Expand All @@ -19,15 +19,14 @@ def extract_zipped_folder(folder):
"spi_2020_21.zip",
]

FILES = [FOLDER / file for file in FILES]

for file in FILES:
if (FOLDER / file).exists():
continue
download(
"PolicyEngine",
"ukda",
"release",
file,
FOLDER / file,
repo="policyengine/policyengine-uk-data",
repo_filename=file.name,
local_folder=file.parent,
)
extract_zipped_folder(FOLDER / file)
(FOLDER / file).unlink()
print(f"Extracting {file}")
extract_zipped_folder(file)
file.unlink()
25 changes: 6 additions & 19 deletions policyengine_uk_data/storage/upload_completed_datasets.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,8 @@
from policyengine_uk_data.utils.github import upload
from pathlib import Path
from tqdm import tqdm
from policyengine_uk_data.datasets import EnhancedFRS_2022_23, FRS_2022_23

FOLDER = Path(__file__).parent
datasets = [EnhancedFRS_2022_23, FRS_2022_23]

FILES = [
"frs_2022_23.h5",
"enhanced_frs_2022_23.h5",
"extended_frs_2022_23.h5",
"reweighted_frs_2022_23.h5",
]

for file in tqdm(FILES):
upload(
"PolicyEngine",
"ukda",
"release",
file,
FOLDER / file,
)
for dataset in datasets:
ds = dataset()
print(f"Uploading {ds.name} with url {ds.url}")
ds.upload()
34 changes: 34 additions & 0 deletions policyengine_uk_data/storage/upload_private_prerequisites.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from policyengine_uk_data.utils.huggingface import upload
from pathlib import Path
import zipfile


def zip_folder(folder):
    """Archive the contents of ``folder`` into a sibling ``.zip`` file.

    The archive is written to ``folder.with_suffix(".zip")``. Entry names are
    relative to ``folder``, so extracting the archive into an empty directory
    reproduces the folder's layout, including nested subdirectories.

    Args:
        folder: Path (``str`` or ``Path``) of the directory to archive.

    Raises:
        FileNotFoundError: If ``folder`` is not an existing directory.
    """
    folder = Path(folder)
    # Fail before opening the archive: ZipFile(..., "w") creates the file
    # immediately, so a missing folder would otherwise leave behind an empty
    # zip that callers checking only for the archive's existence would accept.
    if not folder.is_dir():
        raise FileNotFoundError(f"Folder {folder} not found")
    with zipfile.ZipFile(folder.with_suffix(".zip"), "w") as zip_ref:
        # rglob descends into subdirectories; a plain glob("*") stores only
        # top-level entries, silently dropping any nested files.
        for path in sorted(folder.rglob("*")):
            zip_ref.write(path, path.relative_to(folder).as_posix())


# Directory holding this script; the prerequisite archives live next to it.
FOLDER = Path(__file__).parent

# Absolute paths of the private prerequisite archives to publish.
FILES = [
    Path(FOLDER / archive_name)
    for archive_name in (
        "frs_2022_23.zip",
        "lcfs_2021_22.zip",
        "was_2006_20.zip",
        "etb_1977_21.zip",
        "spi_2020_21.zip",
    )
]

for file in FILES:
    if not file.exists():
        # No archive yet: build it from the folder of the same name
        # (the archive path minus its ".zip" suffix).
        zip_folder(file.with_suffix(""))
        if not file.exists():
            raise FileNotFoundError(f"File {file} not found")
    upload(
        local_file_path=file,
        repo="policyengine/policyengine-uk-data",
        repo_file_path=file.name,
    )
34 changes: 34 additions & 0 deletions policyengine_uk_data/utils/huggingface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from huggingface_hub import hf_hub_download, login, HfApi
import os
import pkg_resources


def download(
    repo: str, repo_filename: str, local_folder: str, version: str = None
) -> str:
    """Download a single file from a Hugging Face model repo.

    Logs in with the token held in the ``HUGGING_FACE_TOKEN`` environment
    variable, then fetches the file into ``local_folder``.

    Args:
        repo: Repository id, e.g. ``"policyengine/policyengine-uk-data"``.
        repo_filename: Path of the file inside the repository.
        local_folder: Local directory the file is downloaded into.
        version: Revision to fetch; ``None`` leaves the choice to
            ``hf_hub_download``.

    Returns:
        The value returned by ``hf_hub_download`` (the local path of the
        downloaded file), so callers no longer have to rebuild it themselves.
    """
    # NOTE(review): if the variable is unset, login() receives token=None;
    # huggingface_hub is expected to fall back to an interactive prompt in
    # that case - confirm this is acceptable for non-interactive runs.
    login(token=os.environ.get("HUGGING_FACE_TOKEN"))

    return hf_hub_download(
        repo_id=repo,
        repo_type="model",
        filename=repo_filename,
        local_dir=local_folder,
        revision=version,
    )


def upload(local_file_path: str, repo: str, repo_file_path: str):
    """Upload a single local file to a Hugging Face model repo.

    Logs in with the token held in the ``HUGGING_FACE_TOKEN`` environment
    variable, then uploads the file through ``HfApi.upload_file``.

    Args:
        local_file_path: Path of the local file to upload.
        repo: Repository id, e.g. ``"policyengine/policyengine-uk-data"``.
        repo_file_path: Destination path of the file inside the repository.

    Returns:
        Whatever ``HfApi.upload_file`` returns, passed through unchanged so
        callers can inspect or log the outcome of the upload.
    """
    # NOTE(review): if the variable is unset, login() receives token=None;
    # huggingface_hub is expected to fall back to an interactive prompt in
    # that case - confirm this is acceptable for non-interactive runs.
    login(token=os.environ.get("HUGGING_FACE_TOKEN"))

    return HfApi().upload_file(
        path_or_fileobj=local_file_path,
        path_in_repo=repo_file_path,
        repo_id=repo,
        repo_type="model",
    )
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ dependencies = [
"requests",
"tqdm",
"tabulate",
"huggingface_hub",
]

[project.optional-dependencies]
Expand Down

0 comments on commit a9437a9

Please sign in to comment.