Skip to content

Commit

Permalink
Mise à jour du domaine Legipêche dans Monitorfish et dans le scraper (#…
Browse files Browse the repository at this point in the history
…2630)

## Linked issues

- Resolve #2629
  • Loading branch information
VincentAntoine authored Oct 23, 2023
2 parents 8560e45 + 1d2c8ab commit 860ec79
Show file tree
Hide file tree
Showing 7 changed files with 95 additions and 79 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
-- Data migration: rewrite the host of every Legipeche url stored in
-- regulations.regulatory_references (a jsonb array of reference objects) from
-- the two former domains to http://legipeche.metier.e2.rie.gouv.fr.
--
-- Steps:
--   1. explode each regulation's array into one row per reference, keeping
--      the element's position in the array;
--   2. rewrite the "url" key of each reference;
--   3. re-aggregate the references per regulation, in their original order;
--   4. write back only the regulations whose references actually changed.
WITH id_regulations AS (
    SELECT
        r.id,
        e.regulatory_reference,
        e.reference_index
    FROM regulations r
    -- The CASE guards against SQL NULL, JSON null and any other non-array
    -- value, for which jsonb_array_elements would raise an error.
    CROSS JOIN LATERAL jsonb_array_elements(
        CASE
            WHEN jsonb_typeof(r.regulatory_references) = 'array'
                THEN r.regulatory_references
            ELSE '[]'::jsonb
        END
    ) WITH ORDINALITY AS e(regulatory_reference, reference_index)
),

updated_id_regulations AS (
    SELECT
        id,
        reference_index,
        CASE
            -- Only touch references that really carry a string url.
            -- jsonb_set is strict: with a missing or JSON-null url the
            -- replacement value would be NULL, jsonb_set would return NULL
            -- and the whole reference would be lost.
            WHEN jsonb_typeof(regulatory_reference->'url') = 'string' THEN
                jsonb_set(
                    regulatory_reference,
                    '{url}',
                    -- to_jsonb escapes quotes and backslashes correctly,
                    -- unlike hand-built '"' || text || '"' cast to jsonb.
                    to_jsonb(
                        replace(
                            replace(
                                regulatory_reference->>'url',
                                'http://legipeche.metier.i2',
                                'http://legipeche.metier.e2.rie.gouv.fr'
                            ),
                            'http://legipeche.metier.intranets.developpement-durable.ader.gouv.fr',
                            'http://legipeche.metier.e2.rie.gouv.fr'
                        )
                    )
                )
            ELSE regulatory_reference
        END AS updated_regulatory_reference
    FROM id_regulations
),

updated_id_regulations_agg AS (
    SELECT
        id,
        -- Explicit ordering: jsonb_agg gives no ordering guarantee otherwise.
        jsonb_agg(
            updated_regulatory_reference ORDER BY reference_index
        ) AS updated_regulatory_references
    FROM updated_id_regulations
    GROUP BY id
)

UPDATE regulations r
SET regulatory_references = u.updated_regulatory_references
FROM updated_id_regulations_agg u
WHERE r.id = u.id
    -- Skip rows that contain no url to rewrite (avoids needless row rewrites).
    AND r.regulatory_references IS DISTINCT FROM u.updated_regulatory_references;
27 changes: 9 additions & 18 deletions datascience/src/pipeline/flows/regulations_checkup.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def add_article_id(regulations: pd.DataFrame, url_column: str) -> pd.DataFrame:
legipeche_regex = re.compile(
(
r"^http://legipeche\.metier\."
r"(?:i2|intranets\.developpement-durable\.ader\.gouv\.fr)/"
r"e2\.rie\.gouv\.fr/"
r"(?:[a-zA-Z0-9-]*)"
r"-a(?P<article_id>\d+)"
r"\.html"
Expand Down Expand Up @@ -427,8 +427,10 @@ def get_unknown_links(
legipeche_regulations: pd.DataFrame,
) -> set:
"""
Returns the urls of `monitorfish_regulations` that do contain an `article_id`
known in `legipeche_regulations`.
Returns the urls of `monitorfish_regulations` whose `article_id`
is either not present in `legipeche_regulations` (i.e. referencing Legipeche
articles that might not exist) or null (which corresponds to urls that do not match
the legipeche url pattern and which usually point to external websites).
Args:
monitorfish_regulations (pd.DataFrame):
Expand Down Expand Up @@ -499,17 +501,13 @@ def get_dead_links(
dead_links_urls = []
for unknown_link in unknown_links:
try:
unknown_link_alias = unknown_link.replace(
"intranets.developpement-durable.ader.gouv.fr",
"i2",
)
logger.info(f"Testing {unknown_link_alias}")
r = requests.get(unknown_link_alias, timeout=10)
logger.info(f"Testing {unknown_link}")
r = requests.get(unknown_link, timeout=10)
r.raise_for_status()
except requests.Timeout:
try:
logger.info(f"{unknown_link_alias} timed out. Retrying with proxies...")
r = requests.get(unknown_link_alias, timeout=10, proxies=proxies)
logger.info(f"{unknown_link} timed out. Retrying with proxies...")
r = requests.get(unknown_link, timeout=10, proxies=proxies)
r.raise_for_status()
except requests.HTTPError:
logger.info(f"{unknown_link} is a dead link.")
Expand All @@ -523,7 +521,6 @@ def get_dead_links(
requests.exceptions.InvalidURL,
) as e:
logger.info(f"{unknown_link} is a dead link (error {type(e)}: {e}).")
logger.info(f"{unknown_link} is a dead link.")
dead_links_urls.append(unknown_link)

# null references are missing_references, not dead_links
Expand Down Expand Up @@ -620,14 +617,12 @@ def format_outdated_references(outdated_references: pd.DataFrame) -> pd.DataFram

@task(checkpoint=False)
def get_main_template() -> jinja2.environment.Template:

with open(EMAIL_TEMPLATES_LOCATION / "regulations_checkup/main.jinja", "r") as f:
return jinja2.Template(f.read())


@task(checkpoint=False)
def get_body_template() -> jinja2.environment.Template:

with open(EMAIL_TEMPLATES_LOCATION / "regulations_checkup/body.jinja", "r") as f:
return jinja2.Template(f.read())

Expand Down Expand Up @@ -698,7 +693,6 @@ def render_main(

@task(checkpoint=False)
def get_recipients() -> List[str]:

try:
assert CNSP_FRANCE_EMAIL_ADDRESS is not None
except AssertionError:
Expand All @@ -710,7 +704,6 @@ def get_recipients() -> List[str]:

@task(checkpoint=False)
def create_message(html: str, recipients: List[str]) -> EmailMessage:

msg = create_html_email(
to=recipients,
subject="[Monitorfish] Suivi des modifications Legipêche dans Monitorfish",
Expand All @@ -726,10 +719,8 @@ def send_message(msg: EmailMessage):


with Flow("Regulations checkup", executor=LocalDaskExecutor()) as flow:

flow_not_running = check_flow_not_running()
with case(flow_not_running, True):

# Parameters
proxies = Parameter("proxies", default=PROXIES)
backoffice_regulation_url = Parameter(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,10 @@
class LegipecheSpider(scrapy.Spider):
name = "legipeche"
start_urls = [
"http://legipeche.metier.i2/bibliotheque-r3.html",
"http://legipeche.metier.e2.rie.gouv.fr/bibliotheque-r3.html",
]

def parse(self, response):

#####################################################
# Extract data from the page, if it is an article page
title = response.xpath('//main[@id="main"]/article/header/h1/text()').get()
Expand Down
8 changes: 4 additions & 4 deletions datascience/tests/test_data/emails/REGULATIONS_CHECKUP.html
Original file line number Diff line number Diff line change
Expand Up @@ -150,15 +150,15 @@ <h3>Modifications de pages existantes</h3>
<td>Reg. Facade 1</td>
<td>Morbihan - bivalves</td>
<td>Secteur 2</td>
<td><a href="http://legipeche.metier.intranets.developpement-durable.ader.gouv.fr/modified-regulation-a668.html">some other regulation</a></td>
<td><a href="http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html">some other regulation</a></td>
<td>Ajout de document</td>
<td><a href="http://bzh.other_3">Bretagne modified reg 3</a></td>
</tr>
<tr>
<td>Reg. Facade 1</td>
<td>Morbihan - bivalves</td>
<td>Secteur 2</td>
<td><a href="http://legipeche.metier.intranets.developpement-durable.ader.gouv.fr/modified-regulation-a668.html">some other regulation</a></td>
<td><a href="http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html">some other regulation</a></td>
<td>Suppression de document</td>
<td><a href="http://bzh.other_2">Bretagne modified reg 2</a></td>
</tr>
Expand Down Expand Up @@ -221,7 +221,7 @@ <h3>Liens morts dans Monitorfish</h3>
<td>Reg. Facade 2</td>
<td>Mediterranée - filets</td>
<td>Zone C</td>
<td><a href="http://legipeche.metier.intranets.developpement-durable.ader.gouv.fr/deleted-regulation-a671.html">Dead link regulation</a></td>
<td><a href="http://legipeche.metier.e2.rie.gouv.fr/deleted-regulation-a671.html">Dead link regulation</a></td>
</tr>
</tbody>
</table>
Expand Down Expand Up @@ -257,7 +257,7 @@ <h3>Réglementations périmées dans Monitorfish</h3>
<td>Reg. Facade 2</td>
<td>Mediterranée - filets</td>
<td>Zone B</td>
<td><a href="http://legipeche.metier.i2/regulation-a689.html">Med regulation</a></td>
<td><a href="http://legipeche.metier.e2.rie.gouv.fr/regulation-a689.html">Med regulation</a></td>
<td>2030-03-17 17:46:40</td>
</tr>
</tbody>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ INSERT INTO public.regulations (
id, law_type, topic, zone, regulatory_references, geometry
) VALUES
(1, 'Reg. Facade 1', 'Morbihan - bivalves', 'Secteur 1', '[{"url": "http://external.site.regulation", "reference": "External regulation", "endDate": 1500000000000}]', '0106000020E610000001000000010300000001000000050000000000000000000000000000000000000000000000000024400000000000000000000000000000244000000000000024400000000000000000000000000000244000000000000000000000000000000000'),
(2, 'Reg. Facade 1', 'Morbihan - bivalves', 'Secteur 2', '[{"url": "http://legipeche.metier.intranets.developpement-durable.ader.gouv.fr/some-regulation-a666.html?var=12", "reference": "some regulation"}, {"url": "http://legipeche.metier.intranets.developpement-durable.ader.gouv.fr/modified-regulation-a668.html", "reference": "some other regulation", "endDate": "infinite"}]', '0106000020E610000001000000010300000001000000050000000000000000005E4000000000000034C00000000000E0604000000000000034C00000000000E0604000000000000024C00000000000005E4000000000000024C00000000000005E4000000000000034C0'),
(2, 'Reg. Facade 1', 'Morbihan - bivalves', 'Secteur 2', '[{"url": "http://legipeche.metier.e2.rie.gouv.fr/some-regulation-a666.html?var=12", "reference": "some regulation"}, {"url": "http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html", "reference": "some other regulation", "endDate": "infinite"}]', '0106000020E610000001000000010300000001000000050000000000000000005E4000000000000034C00000000000E0604000000000000034C00000000000E0604000000000000024C00000000000005E4000000000000024C00000000000005E4000000000000034C0'),
(3, 'Reg. Facade 2', 'Mediterranée - filets', 'Zone A', 'null', '0106000020E610000001000000010300000001000000050000000000000000004EC000000000000024400000000000804BC000000000000024400000000000804BC000000000000034400000000000004EC000000000000034400000000000004EC00000000000002440'),
(4, 'Reg. Facade 2', 'Mediterranée - filets', 'Zone B', '[{"url": "http://legipeche.metier.i2/regulation-a689.html", "reference": "Med regulation", "endDate": 1900000000000}]', '0106000020E6100000010000000103000000010000000500000000000000000024C00000000000804640000000000040654000000000008046400000000000406540000000000000494000000000000024C0000000000000494000000000000024C00000000000804640'),
(5, 'Reg. Facade 2', 'Mediterranée - filets', 'Zone C', '[{"url": "http://legipeche.metier.intranets.developpement-durable.ader.gouv.fr/deleted-regulation-a671.html", "reference": "Dead link regulation"}]', '0106000020E610000001000000010300000001000000050000000000000000805BC00000000000004E4000000000000059C00000000000004E4000000000000059C000000000008051400000000000805BC000000000008051400000000000805BC00000000000004E40');
(4, 'Reg. Facade 2', 'Mediterranée - filets', 'Zone B', '[{"url": "http://legipeche.metier.e2.rie.gouv.fr/regulation-a689.html", "reference": "Med regulation", "endDate": 1900000000000}]', '0106000020E6100000010000000103000000010000000500000000000000000024C00000000000804640000000000040654000000000008046400000000000406540000000000000494000000000000024C0000000000000494000000000000024C00000000000804640'),
(5, 'Reg. Facade 2', 'Mediterranée - filets', 'Zone C', '[{"url": "http://legipeche.metier.e2.rie.gouv.fr/deleted-regulation-a671.html", "reference": "Dead link regulation"}]', '0106000020E610000001000000010300000001000000050000000000000000805BC00000000000004E4000000000000059C00000000000004E4000000000000059C000000000008051400000000000805BC000000000008051400000000000805BC00000000000004E40');
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@ DELETE FROM public.legipeche;
INSERT INTO public.legipeche (
extraction_datetime_utc, extraction_occurence, page_title, page_url, document_title, document_url
) VALUES
( '2021-3-2 14:25', 'previous', 'Some old page', 'http://legipeche.metier.i2/deleted-regulation-a671.html', 'Some old reg text', 'http://some.thing'),
( '2021-3-2 14:25', 'previous', 'Med. sea regulation', 'http://legipeche.metier.i2/regulation-with-unstable-url-a689.html', 'Med reg text', 'http://med.reg'),
( '2021-3-2 14:25', 'previous', 'Bretagne regulation', 'http://legipeche.metier.i2/some-regulation-a666.html', 'Bretagne reg text', 'http://bzh.reg'),
( '2021-3-2 14:25', 'previous', 'Bretagne modified reg', 'http://legipeche.metier.i2/modified-regulation-a668.html', 'Bretagne modified reg 1', 'http://bzh.other_1'),
( '2021-3-2 14:25', 'previous', 'Bretagne modified reg', 'http://legipeche.metier.i2/modified-regulation-a668.html', 'Bretagne modified reg 2', 'http://bzh.other_2'),
( '2021-3-3 14:25', 'latest', 'Bretagne modified reg', 'http://legipeche.metier.i2/modified-regulation-a668.html', 'Bretagne modified reg 1', 'http://bzh.other_1'),
( '2021-3-3 14:25', 'latest', 'Bretagne modified reg', 'http://legipeche.metier.i2/modified-regulation-a668.html', 'Bretagne modified reg 3', 'http://bzh.other_3'),
( '2021-3-3 14:25', 'latest', 'Bretagne regulation', 'http://legipeche.metier.i2/some-regulation-a666.html', 'Bretagne reg text', 'http://bzh.reg'),
( '2021-3-2 14:25', 'previous', 'Unused regulation', 'http://legipeche.metier.i2/unused-regulation-a670.html', 'Unused reg text', 'http://unused.reg'),
( '2021-3-3 14:25', 'latest', 'Med. sea regulation', 'http://legipeche.metier.i2/regulation-with-unstable-url-a689.html', 'Med reg text', 'http://med.reg'),
( '2021-3-3 14:25', 'latest', 'Unused regulation', 'http://legipeche.metier.i2/unused-regulation-a670.html', 'Unused reg text', 'http://unused.reg'),
( '2021-3-3 14:25', 'latest', 'Unused regulation 2', 'http://legipeche.metier.i2/other-unused-regulation-a675.html', 'Unused reg text', 'http://unused2.reg');
( '2021-3-2 14:25', 'previous', 'Some old page', 'http://legipeche.metier.e2.rie.gouv.fr/deleted-regulation-a671.html', 'Some old reg text', 'http://some.thing'),
( '2021-3-2 14:25', 'previous', 'Med. sea regulation', 'http://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html', 'Med reg text', 'http://med.reg'),
( '2021-3-2 14:25', 'previous', 'Bretagne regulation', 'http://legipeche.metier.e2.rie.gouv.fr/some-regulation-a666.html', 'Bretagne reg text', 'http://bzh.reg'),
( '2021-3-2 14:25', 'previous', 'Bretagne modified reg', 'http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html', 'Bretagne modified reg 1', 'http://bzh.other_1'),
( '2021-3-2 14:25', 'previous', 'Bretagne modified reg', 'http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html', 'Bretagne modified reg 2', 'http://bzh.other_2'),
( '2021-3-3 14:25', 'latest', 'Bretagne modified reg', 'http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html', 'Bretagne modified reg 1', 'http://bzh.other_1'),
( '2021-3-3 14:25', 'latest', 'Bretagne modified reg', 'http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html', 'Bretagne modified reg 3', 'http://bzh.other_3'),
( '2021-3-3 14:25', 'latest', 'Bretagne regulation', 'http://legipeche.metier.e2.rie.gouv.fr/some-regulation-a666.html', 'Bretagne reg text', 'http://bzh.reg'),
( '2021-3-2 14:25', 'previous', 'Unused regulation', 'http://legipeche.metier.e2.rie.gouv.fr/unused-regulation-a670.html', 'Unused reg text', 'http://unused.reg'),
( '2021-3-3 14:25', 'latest', 'Med. sea regulation', 'http://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html', 'Med reg text', 'http://med.reg'),
( '2021-3-3 14:25', 'latest', 'Unused regulation', 'http://legipeche.metier.e2.rie.gouv.fr/unused-regulation-a670.html', 'Unused reg text', 'http://unused.reg'),
( '2021-3-3 14:25', 'latest', 'Unused regulation 2', 'http://legipeche.metier.e2.rie.gouv.fr/other-unused-regulation-a675.html', 'Unused reg text', 'http://unused2.reg');
Loading

0 comments on commit 860ec79

Please sign in to comment.