diff --git a/main.py b/main.py index 46ed7e2..dcf1cb4 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,7 @@ import argparse import logging +import datasets from dotenv import load_dotenv from multi_crawler import ArchiveCrawler, CSVExporter, YoutubeCrawler @@ -51,6 +52,12 @@ default=40, required=False, ) + argparser.add_argument( + "--huggingface_dataset", + type=str, + help="Name of the dataset to push to Huggingface Hub", + required=False, + ) args = argparser.parse_args() if args.csv and args.file_name is None: @@ -72,3 +79,7 @@ else: crawlers = ArchiveCrawler(line.split(" ", 1)[1], callback=exporter) crawlers.crawl() + + if args.huggingface_dataset: + dataset = datasets.load_dataset("csv", data_files=args.file_name) + dataset.push_to_hub(args.huggingface_dataset) diff --git a/requirements.txt b/requirements.txt index af4b395..5c522d3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ python-dotenv stem internetarchive pytubefix +datasets \ No newline at end of file