From 3d5c5d9863d73322450bcdb308dd4ad59f746007 Mon Sep 17 00:00:00 2001 From: Vincent Date: Mon, 23 Oct 2023 10:48:43 +0200 Subject: [PATCH 1/3] Update legipeche domain in flows --- .../src/pipeline/flows/regulations_checkup.py | 25 ++----- .../legipeche/spiders/legipeche_spider.py | 3 +- .../test_data/emails/REGULATIONS_CHECKUP.html | 8 +-- .../V666.24__Reset_test_regulations.sql | 6 +- .../V666.6__Reset_test_legipeche.sql | 24 +++---- .../test_flows/test_regulations_checkup.py | 67 ++++++++----------- 6 files changed, 54 insertions(+), 79 deletions(-) diff --git a/datascience/src/pipeline/flows/regulations_checkup.py b/datascience/src/pipeline/flows/regulations_checkup.py index 64e0e49d17..570156615c 100644 --- a/datascience/src/pipeline/flows/regulations_checkup.py +++ b/datascience/src/pipeline/flows/regulations_checkup.py @@ -171,7 +171,7 @@ def add_article_id(regulations: pd.DataFrame, url_column: str) -> pd.DataFrame: legipeche_regex = re.compile( ( r"^http://legipeche\.metier\." - r"(?:i2|intranets\.developpement-durable\.ader\.gouv\.fr)/" + r"e2\.rie\.gouv\.fr/" r"(?:[a-zA-Z0-9-]*)" r"-a(?P\d+)" r"\.html" @@ -427,8 +427,8 @@ def get_unknown_links( legipeche_regulations: pd.DataFrame, ) -> set: """ - Returns the urls of `monitorfish_regulations` that do contain an `article_id` - known in `legipeche_regulations`. + Returns the urls of `monitorfish_regulations` that contain an `article_id` + that is not present in `legipeche_regulations`. 
Args: monitorfish_regulations (pd.DataFrame): @@ -499,17 +499,13 @@ def get_dead_links( dead_links_urls = [] for unknown_link in unknown_links: try: - unknown_link_alias = unknown_link.replace( - "intranets.developpement-durable.ader.gouv.fr", - "i2", - ) - logger.info(f"Testing {unknown_link_alias}") - r = requests.get(unknown_link_alias, timeout=10) + logger.info(f"Testing {unknown_link}") + r = requests.get(unknown_link, timeout=10) r.raise_for_status() except requests.Timeout: try: - logger.info(f"{unknown_link_alias} timed out. Retrying with proxies...") - r = requests.get(unknown_link_alias, timeout=10, proxies=proxies) + logger.info(f"{unknown_link} timed out. Retrying with proxies...") + r = requests.get(unknown_link, timeout=10, proxies=proxies) r.raise_for_status() except requests.HTTPError: logger.info(f"{unknown_link} is a dead link.") @@ -523,7 +519,6 @@ def get_dead_links( requests.exceptions.InvalidURL, ) as e: logger.info(f"{unknown_link} is a dead link (error {type(e)}: {e}).") - logger.info(f"{unknown_link} is a dead link.") dead_links_urls.append(unknown_link) # null references are missing_references, not dead_links @@ -620,14 +615,12 @@ def format_outdated_references(outdated_references: pd.DataFrame) -> pd.DataFram @task(checkpoint=False) def get_main_template() -> jinja2.environment.Template: - with open(EMAIL_TEMPLATES_LOCATION / "regulations_checkup/main.jinja", "r") as f: return jinja2.Template(f.read()) @task(checkpoint=False) def get_body_template() -> jinja2.environment.Template: - with open(EMAIL_TEMPLATES_LOCATION / "regulations_checkup/body.jinja", "r") as f: return jinja2.Template(f.read()) @@ -698,7 +691,6 @@ def render_main( @task(checkpoint=False) def get_recipients() -> List[str]: - try: assert CNSP_FRANCE_EMAIL_ADDRESS is not None except AssertionError: @@ -710,7 +702,6 @@ def get_recipients() -> List[str]: @task(checkpoint=False) def create_message(html: str, recipients: List[str]) -> EmailMessage: - msg = create_html_email( 
to=recipients, subject="[Monitorfish] Suivi des modifications Legipêche dans Monitorfish", @@ -726,10 +717,8 @@ def send_message(msg: EmailMessage): with Flow("Regulations checkup", executor=LocalDaskExecutor()) as flow: - flow_not_running = check_flow_not_running() with case(flow_not_running, True): - # Parameters proxies = Parameter("proxies", default=PROXIES) backoffice_regulation_url = Parameter( diff --git a/datascience/src/pipeline/scraping/legipeche/legipeche/spiders/legipeche_spider.py b/datascience/src/pipeline/scraping/legipeche/legipeche/spiders/legipeche_spider.py index 6f54852066..131b6ab4fe 100644 --- a/datascience/src/pipeline/scraping/legipeche/legipeche/spiders/legipeche_spider.py +++ b/datascience/src/pipeline/scraping/legipeche/legipeche/spiders/legipeche_spider.py @@ -4,11 +4,10 @@ class LegipecheSpider(scrapy.Spider): name = "legipeche" start_urls = [ - "http://legipeche.metier.i2/bibliotheque-r3.html", + "http://legipeche.metier.e2.rie.gouv.fr/bibliotheque-r3.html", ] def parse(self, response): - ##################################################### # Extract data from the page, if it is an article page title = response.xpath('//main[@id="main"]/article/header/h1/text()').get() diff --git a/datascience/tests/test_data/emails/REGULATIONS_CHECKUP.html b/datascience/tests/test_data/emails/REGULATIONS_CHECKUP.html index 4e32201585..f71dbddc04 100644 --- a/datascience/tests/test_data/emails/REGULATIONS_CHECKUP.html +++ b/datascience/tests/test_data/emails/REGULATIONS_CHECKUP.html @@ -150,7 +150,7 @@

Modifications de pages existantes

Reg. Facade 1 Morbihan - bivalves Secteur 2 - some other regulation + some other regulation Ajout de document Bretagne modified reg 3 @@ -158,7 +158,7 @@

Modifications de pages existantes

Reg. Facade 1 Morbihan - bivalves Secteur 2 - some other regulation + some other regulation Suppression de document Bretagne modified reg 2 @@ -221,7 +221,7 @@

Liens morts dans Monitorfish

Reg. Facade 2 Mediterranée - filets Zone C - Dead link regulation + Dead link regulation @@ -257,7 +257,7 @@

Réglementations périmées dans Monitorfish

Reg. Facade 2 Mediterranée - filets Zone B - Med regulation + Med regulation 2030-03-17 17:46:40 diff --git a/datascience/tests/test_data/remote_database/V666.24__Reset_test_regulations.sql b/datascience/tests/test_data/remote_database/V666.24__Reset_test_regulations.sql index ab3e6c8f9f..a1fb416d0c 100644 --- a/datascience/tests/test_data/remote_database/V666.24__Reset_test_regulations.sql +++ b/datascience/tests/test_data/remote_database/V666.24__Reset_test_regulations.sql @@ -4,7 +4,7 @@ INSERT INTO public.regulations ( id, law_type, topic, zone, regulatory_references, geometry ) VALUES (1, 'Reg. Facade 1', 'Morbihan - bivalves', 'Secteur 1', '[{"url": "http://external.site.regulation", "reference": "External regulation", "endDate": 1500000000000}]', '0106000020E610000001000000010300000001000000050000000000000000000000000000000000000000000000000024400000000000000000000000000000244000000000000024400000000000000000000000000000244000000000000000000000000000000000'), - (2, 'Reg. Facade 1', 'Morbihan - bivalves', 'Secteur 2', '[{"url": "http://legipeche.metier.intranets.developpement-durable.ader.gouv.fr/some-regulation-a666.html?var=12", "reference": "some regulation"}, {"url": "http://legipeche.metier.intranets.developpement-durable.ader.gouv.fr/modified-regulation-a668.html", "reference": "some other regulation", "endDate": "infinite"}]', '0106000020E610000001000000010300000001000000050000000000000000005E4000000000000034C00000000000E0604000000000000034C00000000000E0604000000000000024C00000000000005E4000000000000024C00000000000005E4000000000000034C0'), + (2, 'Reg. 
Facade 1', 'Morbihan - bivalves', 'Secteur 2', '[{"url": "http://legipeche.metier.e2.rie.gouv.fr/some-regulation-a666.html?var=12", "reference": "some regulation"}, {"url": "http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html", "reference": "some other regulation", "endDate": "infinite"}]', '0106000020E610000001000000010300000001000000050000000000000000005E4000000000000034C00000000000E0604000000000000034C00000000000E0604000000000000024C00000000000005E4000000000000024C00000000000005E4000000000000034C0'), (3, 'Reg. Facade 2', 'Mediterranée - filets', 'Zone A', 'null', '0106000020E610000001000000010300000001000000050000000000000000004EC000000000000024400000000000804BC000000000000024400000000000804BC000000000000034400000000000004EC000000000000034400000000000004EC00000000000002440'), - (4, 'Reg. Facade 2', 'Mediterranée - filets', 'Zone B', '[{"url": "http://legipeche.metier.i2/regulation-a689.html", "reference": "Med regulation", "endDate": 1900000000000}]', '0106000020E6100000010000000103000000010000000500000000000000000024C00000000000804640000000000040654000000000008046400000000000406540000000000000494000000000000024C0000000000000494000000000000024C00000000000804640'), - (5, 'Reg. Facade 2', 'Mediterranée - filets', 'Zone C', '[{"url": "http://legipeche.metier.intranets.developpement-durable.ader.gouv.fr/deleted-regulation-a671.html", "reference": "Dead link regulation"}]', '0106000020E610000001000000010300000001000000050000000000000000805BC00000000000004E4000000000000059C00000000000004E4000000000000059C000000000008051400000000000805BC000000000008051400000000000805BC00000000000004E40'); + (4, 'Reg. 
Facade 2', 'Mediterranée - filets', 'Zone B', '[{"url": "http://legipeche.metier.e2.rie.gouv.fr/regulation-a689.html", "reference": "Med regulation", "endDate": 1900000000000}]', '0106000020E6100000010000000103000000010000000500000000000000000024C00000000000804640000000000040654000000000008046400000000000406540000000000000494000000000000024C0000000000000494000000000000024C00000000000804640'), + (5, 'Reg. Facade 2', 'Mediterranée - filets', 'Zone C', '[{"url": "http://legipeche.metier.e2.rie.gouv.fr/deleted-regulation-a671.html", "reference": "Dead link regulation"}]', '0106000020E610000001000000010300000001000000050000000000000000805BC00000000000004E4000000000000059C00000000000004E4000000000000059C000000000008051400000000000805BC000000000008051400000000000805BC00000000000004E40'); diff --git a/datascience/tests/test_data/remote_database/V666.6__Reset_test_legipeche.sql b/datascience/tests/test_data/remote_database/V666.6__Reset_test_legipeche.sql index 2014149561..894a2256b8 100644 --- a/datascience/tests/test_data/remote_database/V666.6__Reset_test_legipeche.sql +++ b/datascience/tests/test_data/remote_database/V666.6__Reset_test_legipeche.sql @@ -3,15 +3,15 @@ DELETE FROM public.legipeche; INSERT INTO public.legipeche ( extraction_datetime_utc, extraction_occurence, page_title, page_url, document_title, document_url ) VALUES - ( '2021-3-2 14:25', 'previous', 'Some old page', 'http://legipeche.metier.i2/deleted-regulation-a671.html', 'Some old reg text', 'http://some.thing'), - ( '2021-3-2 14:25', 'previous', 'Med. 
sea regulation', 'http://legipeche.metier.i2/regulation-with-unstable-url-a689.html', 'Med reg text', 'http://med.reg'), - ( '2021-3-2 14:25', 'previous', 'Bretagne regulation', 'http://legipeche.metier.i2/some-regulation-a666.html', 'Bretagne reg text', 'http://bzh.reg'), - ( '2021-3-2 14:25', 'previous', 'Bretagne modified reg', 'http://legipeche.metier.i2/modified-regulation-a668.html', 'Bretagne modified reg 1', 'http://bzh.other_1'), - ( '2021-3-2 14:25', 'previous', 'Bretagne modified reg', 'http://legipeche.metier.i2/modified-regulation-a668.html', 'Bretagne modified reg 2', 'http://bzh.other_2'), - ( '2021-3-3 14:25', 'latest', 'Bretagne modified reg', 'http://legipeche.metier.i2/modified-regulation-a668.html', 'Bretagne modified reg 1', 'http://bzh.other_1'), - ( '2021-3-3 14:25', 'latest', 'Bretagne modified reg', 'http://legipeche.metier.i2/modified-regulation-a668.html', 'Bretagne modified reg 3', 'http://bzh.other_3'), - ( '2021-3-3 14:25', 'latest', 'Bretagne regulation', 'http://legipeche.metier.i2/some-regulation-a666.html', 'Bretagne reg text', 'http://bzh.reg'), - ( '2021-3-2 14:25', 'previous', 'Unused regulation', 'http://legipeche.metier.i2/unused-regulation-a670.html', 'Unused reg text', 'http://unused.reg'), - ( '2021-3-3 14:25', 'latest', 'Med. sea regulation', 'http://legipeche.metier.i2/regulation-with-unstable-url-a689.html', 'Med reg text', 'http://med.reg'), - ( '2021-3-3 14:25', 'latest', 'Unused regulation', 'http://legipeche.metier.i2/unused-regulation-a670.html', 'Unused reg text', 'http://unused.reg'), - ( '2021-3-3 14:25', 'latest', 'Unused regulation 2', 'http://legipeche.metier.i2/other-unused-regulation-a675.html', 'Unused reg text', 'http://unused2.reg'); \ No newline at end of file + ( '2021-3-2 14:25', 'previous', 'Some old page', 'http://legipeche.metier.e2.rie.gouv.fr/deleted-regulation-a671.html', 'Some old reg text', 'http://some.thing'), + ( '2021-3-2 14:25', 'previous', 'Med. 
sea regulation', 'http://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html', 'Med reg text', 'http://med.reg'), + ( '2021-3-2 14:25', 'previous', 'Bretagne regulation', 'http://legipeche.metier.e2.rie.gouv.fr/some-regulation-a666.html', 'Bretagne reg text', 'http://bzh.reg'), + ( '2021-3-2 14:25', 'previous', 'Bretagne modified reg', 'http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html', 'Bretagne modified reg 1', 'http://bzh.other_1'), + ( '2021-3-2 14:25', 'previous', 'Bretagne modified reg', 'http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html', 'Bretagne modified reg 2', 'http://bzh.other_2'), + ( '2021-3-3 14:25', 'latest', 'Bretagne modified reg', 'http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html', 'Bretagne modified reg 1', 'http://bzh.other_1'), + ( '2021-3-3 14:25', 'latest', 'Bretagne modified reg', 'http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html', 'Bretagne modified reg 3', 'http://bzh.other_3'), + ( '2021-3-3 14:25', 'latest', 'Bretagne regulation', 'http://legipeche.metier.e2.rie.gouv.fr/some-regulation-a666.html', 'Bretagne reg text', 'http://bzh.reg'), + ( '2021-3-2 14:25', 'previous', 'Unused regulation', 'http://legipeche.metier.e2.rie.gouv.fr/unused-regulation-a670.html', 'Unused reg text', 'http://unused.reg'), + ( '2021-3-3 14:25', 'latest', 'Med. 
sea regulation', 'http://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html', 'Med reg text', 'http://med.reg'), + ( '2021-3-3 14:25', 'latest', 'Unused regulation', 'http://legipeche.metier.e2.rie.gouv.fr/unused-regulation-a670.html', 'Unused reg text', 'http://unused.reg'), + ( '2021-3-3 14:25', 'latest', 'Unused regulation 2', 'http://legipeche.metier.e2.rie.gouv.fr/other-unused-regulation-a675.html', 'Unused reg text', 'http://unused2.reg'); \ No newline at end of file diff --git a/datascience/tests/test_pipeline/test_flows/test_regulations_checkup.py b/datascience/tests/test_pipeline/test_flows/test_regulations_checkup.py index 2871ef4b0a..ded162afcc 100644 --- a/datascience/tests/test_pipeline/test_flows/test_regulations_checkup.py +++ b/datascience/tests/test_pipeline/test_flows/test_regulations_checkup.py @@ -94,18 +94,18 @@ def monitorfish_regulations() -> pd.DataFrame: "url": [ "http://external.site.regulation", ( - "http://legipeche.metier.intranets.developpement-durable" - ".ader.gouv.fr/some-regulation-a666.html?var=12" + "http://legipeche.metier.e2" + ".rie.gouv.fr/some-regulation-a666.html?var=12" ), ( - "http://legipeche.metier.intranets.developpement-durable" - ".ader.gouv.fr/modified-regulation-a668.html" + "http://legipeche.metier.e2" + ".rie.gouv.fr/modified-regulation-a668.html" ), None, - "http://legipeche.metier.i2/regulation-a689.html", + "http://legipeche.metier.e2.rie.gouv.fr/regulation-a689.html", ( - "http://legipeche.metier.intranets.developpement-durable" - ".ader.gouv.fr/deleted-regulation-a671.html" + "http://legipeche.metier.e2" + ".rie.gouv.fr/deleted-regulation-a671.html" ), ], "reference": [ @@ -131,7 +131,6 @@ def monitorfish_regulations() -> pd.DataFrame: @pytest.fixture def legipeche_regulations() -> pd.DataFrame: - d1 = datetime.datetime(2021, 3, 2, 14, 25, 0) d2 = datetime.datetime(2021, 3, 3, 14, 25, 0) @@ -167,18 +166,18 @@ def legipeche_regulations() -> pd.DataFrame: "Unused regulation 2", ], 
"page_url": [ - "http://legipeche.metier.i2/deleted-regulation-a671.html", - "http://legipeche.metier.i2/regulation-with-unstable-url-a689.html", - "http://legipeche.metier.i2/some-regulation-a666.html", - "http://legipeche.metier.i2/modified-regulation-a668.html", - "http://legipeche.metier.i2/modified-regulation-a668.html", - "http://legipeche.metier.i2/modified-regulation-a668.html", - "http://legipeche.metier.i2/modified-regulation-a668.html", - "http://legipeche.metier.i2/some-regulation-a666.html", - "http://legipeche.metier.i2/unused-regulation-a670.html", - "http://legipeche.metier.i2/regulation-with-unstable-url-a689.html", - "http://legipeche.metier.i2/unused-regulation-a670.html", - "http://legipeche.metier.i2/other-unused-regulation-a675.html", + "http://legipeche.metier.e2.rie.gouv.fr/deleted-regulation-a671.html", + "http://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html", + "http://legipeche.metier.e2.rie.gouv.fr/some-regulation-a666.html", + "http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html", + "http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html", + "http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html", + "http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html", + "http://legipeche.metier.e2.rie.gouv.fr/some-regulation-a666.html", + "http://legipeche.metier.e2.rie.gouv.fr/unused-regulation-a670.html", + "http://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html", + "http://legipeche.metier.e2.rie.gouv.fr/unused-regulation-a670.html", + "http://legipeche.metier.e2.rie.gouv.fr/other-unused-regulation-a675.html", ], "document_title": [ "Some old reg text", @@ -257,13 +256,13 @@ def transformed_regulations() -> pd.DataFrame: "Zone": ["Secteur 2", "Secteur 2"], "Référence réglementaire": [ ( - 'some other regulation' + 'some other regulation' "" ), ( - 'some other regulation' + 'some other regulation' "" ), ], @@ -294,10 +293,7 @@ def 
missing_references() -> pd.DataFrame: def unknown_links() -> set: return { "http://external.site.regulation", - ( - "http://legipeche.metier.intranets.developpement-durable.ader.gouv.fr/" - "deleted-regulation-a671.html" - ), + "http://legipeche.metier.e2.rie.gouv.fr/deleted-regulation-a671.html", } @@ -316,8 +312,8 @@ def formatted_dead_links(): "Référence réglementaire": [ 'External regulation', ( - '' + '' "Dead link regulation" ), ], @@ -341,7 +337,7 @@ def formatted_outdated_references(): "Référence réglementaire": [ 'External regulation', ( - '' + '' "Med regulation" ), ], @@ -448,9 +444,6 @@ def return_200(url, **kwargs): assert mock_get.call_count == 2 for unknown_link in unknown_links: - unknown_link = unknown_link.replace( - "intranets.developpement-durable.ader.gouv.fr", "i2" - ) mock_get.assert_any_call(unknown_link, timeout=10) pd.testing.assert_frame_equal(links, dead_links.head(0)) @@ -474,9 +467,6 @@ def return_404(url, **kwargs): assert mock_get.call_count == 2 for unknown_link in unknown_links: - unknown_link = unknown_link.replace( - "intranets.developpement-durable.ader.gouv.fr", "i2" - ) mock_get.assert_any_call(unknown_link, timeout=10) pd.testing.assert_frame_equal(links, dead_links) @@ -503,9 +493,6 @@ def raise_timeout_if_no_proxies(url, **kwargs): assert mock_get.call_count == 4 for unknown_link in unknown_links: - unknown_link = unknown_link.replace( - "intranets.developpement-durable.ader.gouv.fr", "i2" - ) mock_get.assert_any_call(unknown_link, timeout=10) mock_get.assert_any_call(unknown_link, timeout=10, proxies=PROXIES) From 32e229de37206626ff0e6f7978b207840fa98022 Mon Sep 17 00:00:00 2001 From: Vincent Date: Mon, 23 Oct 2023 13:01:57 +0200 Subject: [PATCH 2/3] Update regulations links to legipeche --- .../V0.237__Update_regulations_table.sql | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 backend/src/main/resources/db/migration/internal/V0.237__Update_regulations_table.sql diff --git 
a/backend/src/main/resources/db/migration/internal/V0.237__Update_regulations_table.sql b/backend/src/main/resources/db/migration/internal/V0.237__Update_regulations_table.sql
new file mode 100644
index 0000000000..c063e1643c
--- /dev/null
+++ b/backend/src/main/resources/db/migration/internal/V0.237__Update_regulations_table.sql
@@ -0,0 +1,57 @@
+-- Rewrites the Legipeche urls stored in regulations.regulatory_references so
+-- that both legacy domains point to legipeche.metier.e2.rie.gouv.fr.
+WITH id_regulations AS (
+    SELECT
+        r.id,
+        elem.regulatory_reference,
+        elem.reference_rank
+    FROM regulations r
+    CROSS JOIN LATERAL jsonb_array_elements(r.regulatory_references)
+        WITH ORDINALITY AS elem(regulatory_reference, reference_rank)
+    -- Only arrays can be unnested: skips SQL NULL, JSON null and scalars/objects.
+    WHERE jsonb_typeof(r.regulatory_references) = 'array'
+),
+
+updated_id_regulations AS (
+    SELECT
+        id,
+        reference_rank,
+        CASE
+            -- jsonb_set returns NULL when the new value is NULL, which would
+            -- replace a reference without a text url by null: keep it as is.
+            WHEN jsonb_typeof(regulatory_reference->'url') = 'string' THEN
+                jsonb_set(
+                    regulatory_reference,
+                    '{url}',
+                    -- to_jsonb escapes quotes and backslashes present in the url.
+                    to_jsonb(replace(
+                        replace(
+                            regulatory_reference->>'url',
+                            'http://legipeche.metier.i2',
+                            'http://legipeche.metier.e2.rie.gouv.fr'
+                        ),
+                        'http://legipeche.metier.intranets.developpement-durable.ader.gouv.fr',
+                        'http://legipeche.metier.e2.rie.gouv.fr'
+                    ))
+                )
+            ELSE regulatory_reference
+        END AS updated_regulatory_reference
+    FROM id_regulations
+),
+
+updated_id_regulations_agg AS (
+    SELECT
+        id,
+        -- Explicit ordering keeps the references in their original order.
+        jsonb_agg(updated_regulatory_reference ORDER BY reference_rank)
+            AS updated_regulatory_references
+    FROM updated_id_regulations
+    GROUP BY id
+)
+
+UPDATE regulations r
+SET regulatory_references = u.updated_regulatory_references
+FROM updated_id_regulations_agg u
+WHERE r.id = u.id
+    -- Leave rows whose references did not change untouched.
+    AND r.regulatory_references IS DISTINCT FROM u.updated_regulatory_references
\ No newline at end of file
From 1d2c8abeb05cf4b1feb85df1a944018963637fc7 Mon Sep 17 00:00:00 2001 From: Vincent Date: Mon, 23 Oct 2023 13:09:27 +0200 Subject: [PATCH 3/3] Update docstring --- datascience/src/pipeline/flows/regulations_checkup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/datascience/src/pipeline/flows/regulations_checkup.py b/datascience/src/pipeline/flows/regulations_checkup.py index 570156615c..e6561e68d9 100644 --- a/datascience/src/pipeline/flows/regulations_checkup.py +++ b/datascience/src/pipeline/flows/regulations_checkup.py @@ -427,8 +427,10 @@ def get_unknown_links( legipeche_regulations: pd.DataFrame, ) -> set: """ - Returns the urls of `monitorfish_regulations` that
contain an `article_id` - that is not present in `legipeche_regulations`. + Returns the urls of `monitorfish_regulations` whose `article_id` + is either not present in `legipeche_regulations` (i.e. referencing Legipeche + articles that might not exist) or null (which corresponds to urls that do not match + the legipeche url pattern and which usually point to external websites). Args: monitorfish_regulations (pd.DataFrame):