Skip to content

Commit

Permalink
Check dataset before push
Browse files Browse the repository at this point in the history
  • Loading branch information
oliverkinch committed Feb 13, 2024
1 parent 1e15e8f commit 2c88617
Showing 1 changed file with 12 additions and 0 deletions.
12 changes: 12 additions & 0 deletions src/scripts/push_to_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@


import logging
from pathlib import Path

import hydra
from datasets import load_dataset
Expand All @@ -17,8 +18,19 @@
@hydra.main(config_path="../../config", config_name="config")
def main(config: DictConfig) -> None:
    """Load the final dataset, verify it is complete, and push it to the Hub.

    Args:
        config: Hydra configuration. This function reads
            ``paths.data_final_dir``, ``paths.data_processed_dir`` and
            ``hf_hub`` from it.

    Raises:
        ValueError: If the number of rows in the ``"train"`` split differs
            from the number of folders in the processed-data directory.
    """
    dataset = load_dataset(config.paths.data_final_dir)

    # Ensure that there is one sample in the dataset for each processed case.
    # An explicit raise is used instead of ``assert`` so the check still runs
    # when Python is started with ``-O`` (which strips assert statements).
    processed_dir = Path(config.paths.data_processed_dir)
    num_samples = dataset.num_rows["train"]
    num_cases = _count_folders_in_dir(processed_dir)
    if num_samples != num_cases:
        raise ValueError(
            f"Dataset has {num_samples} train samples but {num_cases} "
            f"processed case folders were found in {processed_dir}; "
            "refusing to push an inconsistent dataset."
        )

    dataset.push_to_hub(config.hf_hub, private=True)


def _count_folders_in_dir(dir: Path) -> int:
return len([f for f in dir.iterdir() if f.is_dir()])


# Script entry point: only run the push when this file is executed directly.
# ``main`` is called without arguments; it is wrapped by ``@hydra.main``.
if __name__ == "__main__":
    main()

0 comments on commit 2c88617

Please sign in to comment.