diff --git a/src/scripts/finalize.py b/src/scripts/finalize.py index ee2b44a1..5b6793b1 100644 --- a/src/scripts/finalize.py +++ b/src/scripts/finalize.py @@ -1,4 +1,6 @@ -"""This scripts finalizes the data by merging all the processed data into a single dataset. +"""Finalize dataset. + +This script finalizes the data by merging all the processed data into a single dataset. Usage: >>> python src/scripts/finalize.py @@ -14,22 +16,28 @@ from typing import Tuple import hydra +from doms_databasen._utils import append_jsonl, init_jsonl, read_json from omegaconf import DictConfig -from src.doms_databasen._utils import append_jsonl, init_jsonl, read_json - logger = getLogger(__name__) @hydra.main(config_path="../../config", config_name="config") def main(config: DictConfig) -> None: + """Finalize dataset. + + Args: + config (DictConfig): + Hydra config object. + """ data_processed_dir = Path(config.paths.data_processed_dir) data_final_dir = Path(config.paths.data_final_dir) dataset_path = data_final_dir / config.file_names.dataset if dataset_path.exists() and not config.finalize.force: logger.info( - f"Dataset already exists at {dataset_path}. Use 'finalize.force=True' to overwrite." + f"Dataset already exists at {dataset_path}." + "Use 'finalize.force=True' to overwrite." ) return diff --git a/src/scripts/process.py b/src/scripts/process.py index 3595b035..b9b44cdd 100644 --- a/src/scripts/process.py +++ b/src/scripts/process.py @@ -17,18 +17,20 @@ import logging import hydra +from doms_databasen.processor import Processor from omegaconf import DictConfig -from src.doms_databasen.processor import Processor - -# Importing as a module, doesn't work when running as a script? -# from doms_databasen.processor import DomsDatabasenScraper - logger = logging.getLogger(__name__) @hydra.main(config_path="../../config", config_name="config") def main(config: DictConfig) -> None: + """Process scraped data from the DomsDatabasen website. 
+ + Args: + config (DictConfig): + Hydra config object. + """ processor = Processor(config=config) if config.process.all: processor.process_all() diff --git a/src/scripts/scrape.py b/src/scripts/scrape.py index 04c45e3b..d89f455c 100644 --- a/src/scripts/scrape.py +++ b/src/scripts/scrape.py @@ -17,18 +17,21 @@ import logging import hydra +from doms_databasen.scraper import DomsDatabasenScraper from omegaconf import DictConfig -from src.doms_databasen.scraper import DomsDatabasenScraper - -# Importing as a module, doesn't work when running as a script? -# from doms_databasen.scraper import DomsDatabasenScraper - logger = logging.getLogger(__name__) @hydra.main(config_path="../../config", config_name="config") def main(config: DictConfig) -> None: + """Scrape the DomsDatabasen website. + + Args: + config (DictConfig): + Hydra config object. + + """ scraper = DomsDatabasenScraper(config=config) if config.scrape.all: scraper.scrape_all()