diff --git a/.github/workflows/pull_request.yaml b/.github/workflows/pull_request.yaml index fe8af68..7f2e498 100644 --- a/.github/workflows/pull_request.yaml +++ b/.github/workflows/pull_request.yaml @@ -44,11 +44,9 @@ jobs: - name: Download data inputs run: make download env: - POLICYENGINE_UK_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_UK_DATA_GITHUB_TOKEN }} + HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} - name: Build datasets run: make data - env: - LITE_MODE: true - name: Run tests run: pytest - name: Test documentation builds diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml index 1a93160..9893fcf 100644 --- a/.github/workflows/push.yaml +++ b/.github/workflows/push.yaml @@ -39,14 +39,14 @@ jobs: run: make install - name: Download data inputs run: make download - env: - POLICYENGINE_UK_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_UK_DATA_GITHUB_TOKEN }} - name: Build datasets run: make data - env: - LITE_MODE: true - name: Run tests run: pytest + - name: Upload data + run: make upload + env: + HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} - name: Test documentation builds run: make documentation - name: Build Jupyter Book @@ -54,9 +54,8 @@ jobs: - name: Deploy documentation uses: JamesIves/github-pages-deploy-action@releases/v4 with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - BRANCH: gh-pages - FOLDER: docs/_build/html + branch: gh-pages + folder: docs/_build/html publish-to-pypi: name: Publish to PyPI runs-on: ubuntu-latest diff --git a/docker/policyengine_uk_data.Dockerfile b/docker/policyengine_uk_data.Dockerfile index bc4ca90..0048b81 100644 --- a/docker/policyengine_uk_data.Dockerfile +++ b/docker/policyengine_uk_data.Dockerfile @@ -1,3 +1,5 @@ FROM python:latest COPY . . RUN make install +RUN make data +RUN make upload diff --git a/policyengine_uk_data/datasets/frs/enhanced_frs.py b/policyengine_uk_data/datasets/frs/enhanced_frs.py index e8e3253..91bbffd 100644 --- a/policyengine_uk_data/datasets/frs/enhanced_frs.py +++ b/policyengine_uk_data/datasets/frs/enhanced_frs.py @@ -84,7 +84,7 @@ class EnhancedFRS_2022_23(EnhancedFRS): input_frs = ExtendedFRS_2022_23 time_period = 2022 end_year = 2028 - url = "release://PolicyEngine/ukda/1.9.0/enhanced_frs_2022_23.h5" + url = "hf://policyengine/policyengine-uk-data" def reweight( diff --git a/policyengine_uk_data/datasets/frs/frs.py b/policyengine_uk_data/datasets/frs/frs.py index 8d850b5..9c30f52 100644 --- a/policyengine_uk_data/datasets/frs/frs.py +++ b/policyengine_uk_data/datasets/frs/frs.py @@ -138,7 +138,7 @@ class FRS_2022_23(FRS): label = "FRS (2022-23)" file_path = STORAGE_FOLDER / "frs_2022_23.h5" time_period = 2022 - url = "release://PolicyEngine/ukda/1.9.0/frs_2022_23.h5" + url = "hf://policyengine/policyengine-uk-data" def add_id_variables(frs: h5py.File, person: DataFrame, household: DataFrame): diff --git a/policyengine_uk_data/storage/download_private_prerequisites.py b/policyengine_uk_data/storage/download_private_prerequisites.py index 4234142..ef81526 100644 --- a/policyengine_uk_data/storage/download_private_prerequisites.py +++ b/policyengine_uk_data/storage/download_private_prerequisites.py @@ -1,4 +1,4 @@ -from policyengine_uk_data.utils.github import download +from policyengine_uk_data.utils.huggingface import download, upload from pathlib import Path import zipfile @@ -6,7 +6,7 @@ def extract_zipped_folder(folder): folder = Path(folder) with zipfile.ZipFile(folder, "r") as zip_ref: - zip_ref.extractall(folder.parent) + zip_ref.extractall(folder.parent / folder.stem) FOLDER = Path(__file__).parent @@ -19,15 +19,14 @@ def extract_zipped_folder(folder): "spi_2020_21.zip", ] +FILES = [FOLDER / file for file in FILES] + for file in FILES: - if (FOLDER / file).exists(): - continue download( - "PolicyEngine", - "ukda", - "release", - file, - FOLDER / file, + repo="policyengine/policyengine-uk-data", + repo_filename=file.name, + local_folder=file.parent, ) - extract_zipped_folder(FOLDER / file) - (FOLDER / file).unlink() + print(f"Extracting {file}") + extract_zipped_folder(file) + file.unlink() diff --git a/policyengine_uk_data/storage/upload_completed_datasets.py b/policyengine_uk_data/storage/upload_completed_datasets.py index 513bbbb..d7fab41 100644 --- a/policyengine_uk_data/storage/upload_completed_datasets.py +++ b/policyengine_uk_data/storage/upload_completed_datasets.py @@ -1,21 +1,8 @@ -from policyengine_uk_data.utils.github import upload -from pathlib import Path -from tqdm import tqdm +from policyengine_uk_data.datasets import EnhancedFRS_2022_23, FRS_2022_23 -FOLDER = Path(__file__).parent +datasets = [EnhancedFRS_2022_23, FRS_2022_23] -FILES = [ - "frs_2022_23.h5", - "enhanced_frs_2022_23.h5", - "extended_frs_2022_23.h5", - "reweighted_frs_2022_23.h5", -] - -for file in tqdm(FILES): - upload( - "PolicyEngine", - "ukda", - "release", - file, - FOLDER / file, - ) +for dataset in datasets: + ds = dataset() + print(f"Uploading {ds.name} with url {ds.url}") + ds.upload() diff --git a/policyengine_uk_data/storage/upload_private_prerequisites.py b/policyengine_uk_data/storage/upload_private_prerequisites.py new file mode 100644 index 0000000..bc5bcac --- /dev/null +++ b/policyengine_uk_data/storage/upload_private_prerequisites.py @@ -0,0 +1,34 @@ +from policyengine_uk_data.utils.huggingface import upload +from pathlib import Path +import zipfile + + +def zip_folder(folder): + folder = Path(folder) + with zipfile.ZipFile(folder.with_suffix(".zip"), "w") as zip_ref: + for file in folder.glob("*"): + zip_ref.write(file, file.name) + + +FOLDER = Path(__file__).parent + +FILES = [ + "frs_2022_23.zip", + "lcfs_2021_22.zip", + "was_2006_20.zip", + "etb_1977_21.zip", + "spi_2020_21.zip", +] + +FILES = [Path(FOLDER / file) for file in FILES] + +for file in FILES: + if not file.exists(): + zip_folder(FOLDER / file.name[:-4]) + if not file.exists(): + raise FileNotFoundError(f"File {file} not found") + upload( + repo="policyengine/policyengine-uk-data", + repo_file_path=file.name, + local_file_path=file, + ) diff --git a/policyengine_uk_data/utils/huggingface.py b/policyengine_uk_data/utils/huggingface.py new file mode 100644 index 0000000..a46da04 --- /dev/null +++ b/policyengine_uk_data/utils/huggingface.py @@ -0,0 +1,34 @@ +from huggingface_hub import hf_hub_download, login, HfApi +import os +import pkg_resources + + +def download( + repo: str, repo_filename: str, local_folder: str, version: str = None +): + token = os.environ.get( + "HUGGING_FACE_TOKEN", + ) + login(token=token) + + hf_hub_download( + repo_id=repo, + repo_type="model", + filename=repo_filename, + local_dir=local_folder, + revision=version, + ) + + +def upload(local_file_path: str, repo: str, repo_file_path: str): + token = os.environ.get( + "HUGGING_FACE_TOKEN", + ) + login(token=token) + api = HfApi() + api.upload_file( + path_or_fileobj=local_file_path, + path_in_repo=repo_file_path, + repo_id=repo, + repo_type="model", + ) diff --git a/pyproject.toml b/pyproject.toml index dae1cb8..56e6edd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ "requests", "tqdm", "tabulate", + "huggingface_hub", ] [project.optional-dependencies]