From 3a9b01c3bfafb403719cc516c93d2d336ee5bb28 Mon Sep 17 00:00:00 2001
From: huberrob
Date: Fri, 22 Mar 2024 11:33:46 +0100
Subject: [PATCH] removed some debug prints; removed a condition checking for
 HTML content which caused #492 and also caused unwanted behaviour for the
 DataCite (doi cc) exclusion; changed some logger messages; added the missing
 pid_url handover from the external metadata harvester, whose absence also
 caused #492; fixed a typo in the metrics YAML; changed version to 3.2.0;
 added a file touch after a failed DataCite ID update to avoid #489

---
 fuji_server/controllers/fair_check.py        |  6 ++-
 fuji_server/harvester/data_harvester.py      |  2 +-
 fuji_server/harvester/metadata_harvester.py  | 39 ++++++++++++--------
 fuji_server/helper/metadata_collector_rdf.py |  2 +-
 fuji_server/helper/preprocessor.py           |  3 +-
 fuji_server/yaml/metrics_v0.5.yaml           |  2 +-
 pyproject.toml                               |  2 +-
 7 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/fuji_server/controllers/fair_check.py b/fuji_server/controllers/fair_check.py
index 04b819d9..11c10ff3 100644
--- a/fuji_server/controllers/fair_check.py
+++ b/fuji_server/controllers/fair_check.py
@@ -110,7 +110,7 @@ def __init__(
         self.pid_url = None  # full pid # e.g., "https://doi.org/10.1594/pangaea.906092 or url (non-pid)
         self.landing_url = None  # url of the landing page of self.pid_url
         self.origin_url = None  # the url from where all starts - in case of redirection we'll need this later on
-        self.repository_urls = []  # urls identified which could represent the repository
+        self.repository_urls = []  # urls identified which could represent the repository, will need this probably for FAIRiCAT things
         self.landing_html = None
         self.landing_content_type = None
         self.landing_origin = None  # schema + authority of the landing page e.g. https://www.pangaea.de
@@ -388,6 +388,8 @@ def retrieve_metadata_external(self, target_url=None, repeat_mode=False):
         self.linked_namespace_uri.update(self.metadata_harvester.linked_namespace_uri)
         self.related_resources.extend(self.metadata_harvester.related_resources)
         self.metadata_harvester.get_signposting_object_identifier()
+        self.pid_url = self.metadata_harvester.pid_url
+        self.pid_scheme = self.metadata_harvester.pid_scheme
         self.pid_collector.update(self.metadata_harvester.pid_collector)

     """def lookup_metadatastandard_by_name(self, value):
@@ -648,4 +650,4 @@ def set_repository_uris(self):
                     self.repository_urls.append(publisher_url)
         if self.repository_urls:
             self.repository_urls = list(set(self.repository_urls))
-        print("REPOSITORY: ", self.repository_urls)
+        # print("REPOSITORY: ", self.repository_urls)
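
Note on the fair_check.py hunk above: this is the handover half of the #492 fix. The checker previously never copied the PID resolved by the metadata harvester back onto itself, so later metric tests saw pid_url as None. A minimal sketch of the pattern, with heavily simplified, hypothetical stand-ins for the real classes:

class MetadataHarvester:
    """Simplified stand-in for fuji_server.harvester.metadata_harvester.MetadataHarvester."""

    def __init__(self):
        self.pid_url = None
        self.pid_scheme = None
        self.pid_collector = {}

    def retrieve_metadata_external(self, target_url=None):
        # real harvesting would resolve these from signposting / typed links
        self.pid_url = "https://doi.org/10.1594/pangaea.906092"
        self.pid_scheme = "doi"
        self.pid_collector[self.pid_url] = {"is_persistent": True}


class FAIRCheck:
    """Simplified stand-in for fuji_server.controllers.fair_check.FAIRCheck."""

    def __init__(self, harvester):
        self.metadata_harvester = harvester
        self.pid_url = None
        self.pid_scheme = None
        self.pid_collector = {}

    def retrieve_metadata_external(self, target_url=None):
        self.metadata_harvester.retrieve_metadata_external(target_url)
        # the handover added in this patch; without it self.pid_url stayed None
        self.pid_url = self.metadata_harvester.pid_url
        self.pid_scheme = self.metadata_harvester.pid_scheme
        self.pid_collector.update(self.metadata_harvester.pid_collector)
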
diff --git a/fuji_server/harvester/data_harvester.py b/fuji_server/harvester/data_harvester.py
index d383aa97..daf2dc1a 100644
--- a/fuji_server/harvester/data_harvester.py
+++ b/fuji_server/harvester/data_harvester.py
@@ -113,7 +113,7 @@ def retrieve_all_data(self, scan_content=True):
                 timeout = 10
                 if len(ft) > self.max_number_per_mime:
                     self.logger.warning(
-                        f"FsF-F3-01M : Found more than -: {self.max_number_per_mime!s} data links (out of {len(ft)!s}) of type {fmime} will only take {self.max_number_per_mime!s}"
+                        f"FsF-F3-01M : Found more than -: {self.max_number_per_mime!s} data links (out of {len(ft)!s}) of type {fmime} will only take {self.max_number_per_mime!s} for content analysis"
                     )
                 files_to_check = ft[: self.max_number_per_mime]
                 # add the fifth one for compatibility reasons < f-uji 3.0.1, when we took the last of list of length FILES_LIMIT
diff --git a/fuji_server/harvester/metadata_harvester.py b/fuji_server/harvester/metadata_harvester.py
index 133682a8..8351be71 100644
--- a/fuji_server/harvester/metadata_harvester.py
+++ b/fuji_server/harvester/metadata_harvester.py
@@ -263,17 +263,22 @@ def check_if_pid_resolves_to_landing_page(self, pid_url=None):
         candidate_landing_url = self.pid_collector[pid_url].get("resolved_url")
         if candidate_landing_url and self.landing_url:
             candidate_landing_url_parts = extract(candidate_landing_url)
+            # print(candidate_landing_url_parts )
             # landing_url_parts = extract(self.landing_url)
             input_id_domain = candidate_landing_url_parts.domain + "." + candidate_landing_url_parts.suffix
             # landing_domain = landing_url_parts.domain + "." + landing_url_parts.suffix
             if self.landing_domain != input_id_domain:
                 self.logger.warning(
                     "FsF-F1-02D : Landing page domain resolved from PID found in metadata does not match with input URL domain -:"
-                    + str(pid_url)
+                    + str(self.landing_domain)
+                    + " <> "
+                    + str(input_id_domain)
                 )
                 self.logger.warning(
                     "FsF-F2-01M : Landing page domain resolved from PID found in metadata does not match with input URL domain -:"
-                    + str(pid_url)
+                    + str(self.landing_domain)
+                    + " <> "
+                    + str(input_id_domain)
                 )
                 return False
             else:
@@ -322,6 +327,7 @@ def check_pidtest_repeat(self):
             if idhelper.is_persistent and validated:
                 found_pids[found_id_scheme] = idhelper.get_identifier_url()
         if len(found_pids) >= 1 and self.repeat_pid_check is False:
+            # print(found_pids, next(iter(found_pids.items())))
             self.logger.info(
                 "FsF-F2-01M : Found object identifier in metadata, repeating PID check for FsF-F1-02D"
             )
@@ -702,17 +708,17 @@ def retrieve_metadata_embedded(self):
                 self.logger.error("FsF-F2-01M : Resource inaccessible -: " + str(e))
                 pass

-        if self.landing_url and self.is_html_page:
+        if self.landing_url:
             if self.landing_url not in ["https://datacite.org/invalid.html"]:
                 if response_status == 200:
                     if "html" in requestHelper.content_type:
                         self.raise_warning_if_javascript_page(requestHelper.response_content)
-
                     up = urlparse(self.landing_url)
                     upp = extract(self.landing_url)
                     self.landing_origin = f"{up.scheme}://{up.netloc}"
                     self.landing_domain = upp.domain + "." + upp.suffix
-                    self.landing_html = requestHelper.getResponseContent()
+                    if self.is_html_page:
+                        self.landing_html = requestHelper.getResponseContent()
                     self.landing_content_type = requestHelper.content_type
                     self.landing_redirect_list = requestHelper.redirect_list
                     self.landing_redirect_status_list = requestHelper.redirect_status_list
@@ -1441,16 +1447,19 @@ def retrieve_metadata_external(self, target_url=None, repeat_mode=False):
             target_url_list = [self.origin_url, self.landing_url]
         # specific target url
         if isinstance(target_url, str):
-            target_url_list = [target_url]
-
-        target_url_list = set(tu for tu in target_url_list if tu is not None)
-        self.retrieve_metadata_external_xml_negotiated(target_url_list)
-        self.retrieve_metadata_external_schemaorg_negotiated(target_url_list)
-        self.retrieve_metadata_external_rdf_negotiated(target_url_list)
-        self.retrieve_metadata_external_datacite()
-        if not repeat_mode:
-            self.retrieve_metadata_external_linked_metadata()
-            self.retrieve_metadata_external_oai_ore()
+            if self.use_datacite is False and "doi" == self.pid_scheme:
+                target_url_list = []
+            else:
+                target_url_list = [target_url]
+        if target_url_list:
+            target_url_list = set(tu for tu in target_url_list if tu is not None)
+            self.retrieve_metadata_external_xml_negotiated(target_url_list)
+            self.retrieve_metadata_external_schemaorg_negotiated(target_url_list)
+            self.retrieve_metadata_external_rdf_negotiated(target_url_list)
+            self.retrieve_metadata_external_datacite()
+            if not repeat_mode:
+                self.retrieve_metadata_external_linked_metadata()
+                self.retrieve_metadata_external_oai_ore()

         """if self.reference_elements:
             self.logger.debug(f"FsF-F2-01M : Reference metadata elements NOT FOUND -: {self.reference_elements}")
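
Note on the last metadata_harvester.py hunk above: this is the other half of the #492 / DataCite-exclusion fix. An explicit DOI target URL is now dropped entirely when use_datacite is off, instead of still being content-negotiated, which previously re-imported DataCite metadata despite the exclusion. A self-contained sketch of the guard, with simplified function parameters standing in for the harvester's attributes:

def build_target_url_list(target_url, origin_url, landing_url, pid_scheme, use_datacite):
    # mirrors the branch added in the hunk above, outside the class for clarity
    if isinstance(target_url, str):
        if use_datacite is False and pid_scheme == "doi":
            target_url_list = []  # honour the DataCite (DOI) exclusion
        else:
            target_url_list = [target_url]
    else:
        target_url_list = [origin_url, landing_url]
    # deduplicate and drop unresolved entries before content negotiation
    return set(tu for tu in target_url_list if tu is not None)

# with DataCite disabled, a DOI target yields no URLs to negotiate against
assert build_target_url_list("https://doi.org/10.1234/x", None, None, "doi", False) == set()
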
diff --git a/fuji_server/helper/metadata_collector_rdf.py b/fuji_server/helper/metadata_collector_rdf.py
index 54f0b319..6c00ec80 100644
--- a/fuji_server/helper/metadata_collector_rdf.py
+++ b/fuji_server/helper/metadata_collector_rdf.py
@@ -1008,7 +1008,7 @@ def get_dcat_metadata(self, graph):
         datasets = list(graph[: RDF.type : DCAT.Dataset])
         table = list(graph[: RDF.type : CSVW.Column])

-        print("TABLE", len(table))
+        # print("TABLE", len(table))
         if len(datasets) > 1:
             self.logger.info("FsF-F2-01M : Found more than one DCAT Dataset description, will use first one")
         if len(datasets) > 0:
diff --git a/fuji_server/helper/preprocessor.py b/fuji_server/helper/preprocessor.py
index bc80739b..aafc955e 100644
--- a/fuji_server/helper/preprocessor.py
+++ b/fuji_server/helper/preprocessor.py
@@ -230,7 +230,7 @@ def retrieve_datacite_re3repos(cls):
         print("updating re3data dois")
         p = {"query": "re3data_id:*"}
         try:
-            req = requests.get(cls.DATACITE_API_REPO, params=p, headers=cls.header)
+            req = requests.get(cls.DATACITE_API_REPO, params=p, headers=cls.header, timeout=5)
             raw = req.json()
             for r in raw["data"]:
                 cls.re3repositories[r["id"]] = r["attributes"]["re3data"]
@@ -245,6 +245,7 @@
                 yaml.dump(cls.re3repositories, f2)

         except requests.exceptions.RequestException as e:
+            os.utime(re3dict_path)
             print("Preprocessor Error: " + str(e))
             cls.logger.error(e)

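
Note on the preprocessor.py change above: it addresses #489 twice over. The DataCite request gets a timeout, and a failed refresh of the cached re3data DOI mapping now touches the cache file so the next run does not immediately retry. A rough sketch of the pattern, assuming a hypothetical mtime-based staleness check and cache path (F-UJI's actual refresh trigger may differ):

import os
import time

import requests

CACHE_PATH = "re3repositories.yaml"  # hypothetical cache location
MAX_AGE = 24 * 60 * 60  # refresh at most once a day


def refresh_repos_cache(api_url):
    try:
        age = time.time() - os.path.getmtime(CACHE_PATH)
    except OSError:
        age = MAX_AGE  # no cache yet: force a refresh attempt
    if age < MAX_AGE:
        return  # cache considered fresh, skip the remote call
    try:
        req = requests.get(api_url, params={"query": "re3data_id:*"}, timeout=5)
        with open(CACHE_PATH, "w") as f:
            f.write(str(req.json()))  # the real code dumps YAML here
    except requests.exceptions.RequestException as e:
        # failed update: touch the cache so subsequent runs back off (#489)
        if os.path.exists(CACHE_PATH):
            os.utime(CACHE_PATH)
        print("Preprocessor Error: " + str(e))
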
diff --git a/fuji_server/yaml/metrics_v0.5.yaml b/fuji_server/yaml/metrics_v0.5.yaml
index eea6a4b8..3fe77646 100644
--- a/fuji_server/yaml/metrics_v0.5.yaml
+++ b/fuji_server/yaml/metrics_v0.5.yaml
@@ -187,7 +187,7 @@ metrics:
       metric_test_score: 1
       metric_test_maturity: 3
      metric_test_requirements:
-        - target: http://f-uji.net/vocab/metadata/sources
+        - target: http://f-uji.net/vocab/metadata/standard
          modality: any
          required:
            name:
diff --git a/pyproject.toml b/pyproject.toml
index 82a2b9a7..c58e01f2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -61,7 +61,7 @@ license = "MIT"
 name = "fuji"
 readme = "README.md"
 requires-python = "~=3.11"  # at the moment only Python 3.11 is supported
-version = "3.1.1"
+version = "3.2.0"

 [project.optional-dependencies]
 dev = [