From 4fc3a2ad179a639e9fd28cb370db7e4a93fc111b Mon Sep 17 00:00:00 2001 From: Oliver Kinch Date: Wed, 6 Mar 2024 09:50:48 +0100 Subject: [PATCH] Scrape date --- pyproject.toml | 3 +++ src/doms_databasen/_xpaths.py | 1 + src/doms_databasen/scraper.py | 20 ++++++++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 03fcc116..75d331fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,6 +65,9 @@ extend-select = [ "I", "D", ] +exclude = [ + "src/doms_databasen/_xpaths.py", +] [tool.ruff.pydocstyle] convention = "google" diff --git a/src/doms_databasen/_xpaths.py b/src/doms_databasen/_xpaths.py index 2275fa1a..9eb8a048 100644 --- a/src/doms_databasen/_xpaths.py +++ b/src/doms_databasen/_xpaths.py @@ -7,6 +7,7 @@ "Accept cookies": "//a[@id='CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll']", "Øvrige sagsoplysninger": "//span[@class='accordion-title'][contains(text(), 'Øvrige sagsoplysninger')]", "Sagen er ikke tilgængelig": "//h1[contains(text(), 'Sagen er ikke tilgængelig')]", + "Dato": "//tr[@tabindex='0']//td[1]", } XPATHS_TABULAR_DATA = { "Overskrift": "//h4[contains(text(), 'Overskrift')]/following-sibling::span[1]/p", diff --git a/src/doms_databasen/scraper.py b/src/doms_databasen/scraper.py index 6333e790..8a6ad937 100644 --- a/src/doms_databasen/scraper.py +++ b/src/doms_databasen/scraper.py @@ -2,6 +2,7 @@ import logging import os +import re import shutil import time from pathlib import Path @@ -255,8 +256,27 @@ def _get_tabular_data(self) -> dict: element = self.driver.find_element(By.XPATH, xpath) tabular_data[key] = element.text.strip() + # Not part of the tabular data table, but + # we will include the date of the case here. + tabular_data["Dato"] = self._get_date() + return tabular_data + def _get_date(self) -> str: + """Gets the date of the case. + + Returns: + date (str): + Date of the case + """ + date = "" + element = self.driver.find_element(By.XPATH, XPATHS["Dato"]) + # Datetime is on format "dd-mm-yyyy" + found = re.search(r"\d{2}-\d{2}-\d{4}", element.text.strip()) + if found: + date = found.group() + return date + def _accept_cookies(self) -> None: """Accepts cookies on the page.""" element = WebDriverWait(self.driver, self.config.scrape.sleep).until(