From c9e2c9b4dbbb16408b5e2910de59e01559f37978 Mon Sep 17 00:00:00 2001 From: Oliver Kinch Date: Tue, 5 Mar 2024 12:54:09 +0100 Subject: [PATCH] Problems with scrape tests --- config/scrape/scrape.yaml | 2 +- src/doms_databasen/scraper.py | 29 +++++++++++++++++------------ tests/scraper/test_scraper.py | 11 ++++++++++- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/config/scrape/scrape.yaml b/config/scrape/scrape.yaml index d5a89d06..e34be38c 100644 --- a/config/scrape/scrape.yaml +++ b/config/scrape/scrape.yaml @@ -20,4 +20,4 @@ test_case_id: "1" # Constants sleep: 5 max_consecutive_nonexistent_page_count: 100 -timeout_pdf_download: 20 +timeout_pdf_download: 10 diff --git a/src/doms_databasen/scraper.py b/src/doms_databasen/scraper.py index eeee01d3..7343f7a1 100644 --- a/src/doms_databasen/scraper.py +++ b/src/doms_databasen/scraper.py @@ -51,11 +51,7 @@ def __init__(self, config) -> None: """Initializes the Scraper.""" self.config = config self.test_dir = Path(self.config.scrape.paths.test_dir) - self.download_dir = ( - Path(self.config.scrape.paths.download_dir) - if not self.config.testing - else self.test_dir - ) + self.download_dir = Path(self.config.scrape.paths.download_dir) self.data_raw_dir = Path(self.config.paths.data_raw_dir) self.force = self.config.scrape.force @@ -78,7 +74,7 @@ def scrape(self, case_id: str) -> None: case_dir = ( self.data_raw_dir / case_id if not self.config.testing - else self.test_dir / self.config.scrape.test_case_name + else self.test_dir / case_id ) if self._already_scraped(case_dir) and not self.force: @@ -236,13 +232,14 @@ def _download_pdf(self, case_dir: Path) -> None: ) download_element.click() - file_name = self._wait_download(files_before_download) + file_name = self._wait_download(files_before=files_before_download) if file_name: - from_ = ( - self.download_dir / file_name - if not self.config.testing - else self.test_dir / file_name - ) + # print cwd + print("cwd", os.getcwd()) + # list dir + print("listdir", os.listdir()) + + from_ = self.download_dir / file_name to_ = case_dir / self.config.file_names.pdf_document shutil.move(from_, to_) else: @@ -315,3 +312,11 @@ def _element_exists(self, xpath) -> bool: except Exception as e: logger.error(e) raise e + + # # When closing the scraper, the webdriver should be closed. + # # and the download folder should be deleted. + # def __del__(self): + # """Closes the scraper.""" + # self.driver.quit() + # shutil.rmtree(self.download_dir) + # logger.info("Scraper closed") diff --git a/tests/scraper/test_scraper.py b/tests/scraper/test_scraper.py index ec197469..690c25d7 100644 --- a/tests/scraper/test_scraper.py +++ b/tests/scraper/test_scraper.py @@ -1,3 +1,5 @@ +"""Test the scraper module.""" + from pathlib import Path import pytest @@ -5,12 +7,19 @@ @pytest.fixture(scope="module") def test_case_path(config): - return Path(config.scrape.paths.test_dir) / config.scrape.test_case_name + """Return the path to the test case.""" + return Path(config.scrape.paths.test_dir) / config.scrape.test_case_id def test_case_contains_pdf(config, test_case_path): + """Test that the test case contains a PDF document.""" assert (test_case_path / config.file_names.pdf_document).exists() def test_case_contains_tabular_data(config, test_case_path): + """Test that the test case contains tabular data.""" assert (test_case_path / config.file_names.tabular_data).exists() + + +if __name__ == "__main__": + pytest.main([f"{__file__}::test_case_contains_pdf", "-s"])