From a4fd79b1c851e2b82fb5394512524bac0f87854e Mon Sep 17 00:00:00 2001 From: Pablo Orviz Date: Wed, 16 Oct 2024 11:57:17 +0200 Subject: [PATCH 1/4] Fix: gather 'start_date' and 'end_date' as part of temporal coverage gathering --- plugins/epos/plugin.py | 1 + 1 file changed, 1 insertion(+) diff --git a/plugins/epos/plugin.py b/plugins/epos/plugin.py index 529db7d..bc16341 100644 --- a/plugins/epos/plugin.py +++ b/plugins/epos/plugin.py @@ -83,6 +83,7 @@ def _get_temporal_coverage(cls, element_values): "start_date": value_data.get("startDate", ""), "end_date": value_data.get("endDate", ""), } + for value_data in element_values ] @classmethod From 64709c84c571908888ada789655bc9a165cb4446 Mon Sep 17 00:00:00 2001 From: Pablo Orviz Date: Wed, 16 Oct 2024 12:29:08 +0200 Subject: [PATCH 2/4] Fix: exception caught when attempting to request an external resource --- api/utils.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/api/utils.py b/api/utils.py index abb5e3d..6b73e55 100644 --- a/api/utils.py +++ b/api/utils.py @@ -888,14 +888,20 @@ def resolve_handle(handle_id): def check_link(address, return_http_code=False): + resolves = False req = urllib.request.Request(url=address) - resp = urllib.request.urlopen(req) - if return_http_code: - return resp.status - if resp.status in [400, 404, 403, 408, 409, 501, 502, 503]: - return False + try: + resp = urllib.request.urlopen(req) + except urllib.error.HTTPError as e: + logging.warning("Could not access to resource: %s" % address) else: - return True + http_code = resp.status + logging.debug("Returned HTTP status from '%s': %s" % (address, http_code)) + if return_http_code: + return http_code + if http_code not in ["400", "404", "403", "408", "409", "501", "502", "503"]: + resolves = True + return resolves def get_protocol_scheme(url): From 072be11d2f6d0ae0c29c16066514396263a1af84 Mon Sep 17 00:00:00 2001 From: Pablo Orviz Date: Wed, 16 Oct 2024 12:29:39 +0200 Subject: [PATCH 3/4] Fix: 
better handling of non-URI values --- plugins/epos/plugin.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/plugins/epos/plugin.py b/plugins/epos/plugin.py index bc16341..82bf7b8 100644 --- a/plugins/epos/plugin.py +++ b/plugins/epos/plugin.py @@ -827,17 +827,22 @@ def rda_a1_03d(self, **kwargs): for uri in data_access_uri: resolves = False schemes = idutils.detect_identifier_schemes(uri) - logger.debug("Identifier schemes found: %s" % schemes) - if "doi" in schemes or "handle" in schemes: - resolves = ut.resolve_handle(uri)[0] - elif "url" in schemes: - resolves = ut.check_link(uri) + if not schemes: + logger.warning("Could not get the scheme/s from the value: %s" % uri) else: - logger.warning( - "Scheme/s used by the identifier not known: %s" % schemes + logger.debug( + "Identifier schemes found for the value '%s': %s" % (uri, schemes) ) - if resolves: - resolvable_uris.append(uri) + if "doi" in schemes or "handle" in schemes: + resolves = ut.resolve_handle(uri)[0] + elif "url" in schemes: + resolves = ut.check_link(uri) + else: + logger.warning( + "Scheme/s used by the identifier not known: %s" % schemes + ) + if resolves: + resolvable_uris.append(uri) resolvable_uris_num = len(resolvable_uris) if resolvable_uris: From d0c24eff87e4d39a74e84d634d3c18fa92c85efe Mon Sep 17 00:00:00 2001 From: Pablo Orviz Date: Wed, 16 Oct 2024 15:31:35 +0200 Subject: [PATCH 4/4] Do not fail when TimeoutError exception is raised --- api/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/api/utils.py b/api/utils.py index 6b73e55..55f18f0 100644 --- a/api/utils.py +++ b/api/utils.py @@ -891,7 +891,9 @@ def check_link(address, return_http_code=False): resolves = False req = urllib.request.Request(url=address) try: - resp = urllib.request.urlopen(req) + resp = urllib.request.urlopen(req, timeout=15) + except urllib.error.URLError as e: + logging.warning("Timeout reached while trying to connect to '%s'" % address) 
except urllib.error.HTTPError as e: logging.warning("Could not access to resource: %s" % address) else: