diff --git a/fuji_server/config/server.ini b/fuji_server/config/server.ini index c3902d74..8f86ca66 100644 --- a/fuji_server/config/server.ini +++ b/fuji_server/config/server.ini @@ -9,9 +9,10 @@ debug_mode = true data_files_limit = 5 log_config = config/logging.ini logdir = logs +verify_pids = false # the URI which triggers the remote logging all other F-UJI server requests are ignored -remote_log_host = fuji.localhost -remote_log_path = /loghandler/index.php +remote_log_host = +remote_log_path = rate_limit = 100 per minute # limits the maximum size of content (metadata) which can be downloaded max_content_size = 5000000 diff --git a/fuji_server/controllers/fair_check.py b/fuji_server/controllers/fair_check.py index 01604700..ff35906d 100644 --- a/fuji_server/controllers/fair_check.py +++ b/fuji_server/controllers/fair_check.py @@ -4,6 +4,7 @@ import hashlib import io +import json import logging import logging.handlers import re @@ -50,7 +51,7 @@ from fuji_server.harvester.data_harvester import DataHarvester from fuji_server.harvester.github_harvester import GithubHarvester from fuji_server.harvester.metadata_harvester import MetadataHarvester -from fuji_server.helper.linked_vocab_helper import linked_vocab_helper +from fuji_server.helper.linked_vocab_helper import LinkedVocabHelper from fuji_server.helper.metadata_collector import MetadataOfferingMethods from fuji_server.helper.metadata_mapper import Mapper from fuji_server.helper.metric_helper import MetricHelper @@ -70,7 +71,7 @@ class FAIRCheck: LONG_TERM_FILE_FORMATS = None OPEN_FILE_FORMATS = None DEFAULT_NAMESPACES = None - VOCAB_NAMESPACES = None + # VOCAB_NAMESPACES = None ARCHIVE_MIMETYPES = Mapper.ARCHIVE_COMPRESS_MIMETYPES.value STANDARD_PROTOCOLS = None SCHEMA_ORG_CONTEXT = [] @@ -184,7 +185,7 @@ def __init__( FAIRCheck.load_predata() # self.extruct = None self.extruct_result = {} - self.lov_helper = linked_vocab_helper(self.LINKED_VOCAB_INDEX) + self.lov_helper = 
LinkedVocabHelper(self.LINKED_VOCAB_INDEX) self.auth_token = None self.auth_token_type = "Basic" @@ -241,8 +242,8 @@ def load_predata(cls): cls.OPEN_FILE_FORMATS = Preprocessor.get_open_file_formats() if not cls.DEFAULT_NAMESPACES: cls.DEFAULT_NAMESPACES = Preprocessor.getDefaultNamespaces() - if not cls.VOCAB_NAMESPACES: - cls.VOCAB_NAMESPACES = Preprocessor.getLinkedVocabs() + # if not cls.VOCAB_NAMESPACES: + # cls.VOCAB_NAMESPACES = Preprocessor.getLinkedVocabs() if not cls.STANDARD_PROTOCOLS: cls.STANDARD_PROTOCOLS = Preprocessor.get_standard_protocols() if not cls.SCHEMA_ORG_CONTEXT: @@ -282,43 +283,70 @@ def set_auth_token(self, auth_token, auth_token_type="Basic"): self.auth_token_type = "Basic" def clean_metadata(self): + # replace nasty "None" strings by real None + try: + nonerepdict = json.dumps(self.metadata_merged).replace('"None"', "null") + self.metadata_merged = json.loads(nonerepdict) + except: + print("Nasty None replace error") + pass data_objects = self.metadata_merged.get("object_content_identifier") if data_objects == {"url": None} or data_objects == [None]: data_objects = self.metadata_merged["object_content_identifier"] = None if data_objects is not None: if not isinstance(data_objects, list): self.metadata_merged["object_content_identifier"] = [data_objects] - - # TODO quick-fix to merge size information - should do it at mapper - if "object_content_identifier" in self.metadata_merged: - if self.metadata_merged.get("object_content_identifier"): - oi = 0 - for c in self.metadata_merged["object_content_identifier"]: - if not c.get("size") and self.metadata_merged.get("object_size"): - c["size"] = self.metadata_merged.get("object_size") - # clean mime types in case these are in URI form: - if c.get("type"): - if isinstance(c["type"], list): - c["type"] = c["type"][0] - self.metadata_merged["object_content_identifier"][oi]["type"] = c["type"][0] - mime_parts = str(c.get("type")).split("/") - if len(mime_parts) > 2: - if mime_parts[-2] in [ - 
"application", - "audio", - "font", - "example", - "image", - "message", - "model", - "multipart", - "text", - "video", - ]: - self.metadata_merged["object_content_identifier"][oi]["type"] = ( - str(mime_parts[-2]) + "/" + str(mime_parts[-1]) - ) - oi += 1 + # duplicate handling + if self.metadata_merged.get("object_content_identifier"): + fdci = {} + for dci in self.metadata_merged.get("object_content_identifier"): + dcurl = dci.get("url") + if dcurl not in fdci: + fdci[dcurl] = dci + else: + # complete size and type + if not fdci[dcurl].get("type") and dci.get("type"): + fdci[dcurl]["type"] = dci.get("type") + if not fdci[dcurl].get("size") and dci.get("size"): + fdci[dcurl]["size"] = dci.get("size") + self.metadata_merged["object_content_identifier"] = [di for di in fdci.values()] + + # if "object_content_identifier" in self.metadata_merged: + # if self.metadata_merged.get("object_content_identifier"): + oi = 0 + for c in self.metadata_merged["object_content_identifier"]: + if ( + not c.get("size") + and self.metadata_merged.get("object_size") + and len(self.metadata_merged["object_content_identifier"]) == 1 + ): + # c["size"] = self.metadata_merged.get("object_size") + self.metadata_merged["object_content_identifier"][oi]["size"] = self.metadata_merged.get( + "object_size" + ) + # clean mime types in case these are in URI form: + if c.get("type"): + if isinstance(c["type"], list): + c["type"] = c["type"][0] + self.metadata_merged["object_content_identifier"][oi]["type"] = c["type"][0] + mime_parts = str(c.get("type")).split("/") + if len(mime_parts) > 2: + if mime_parts[-2] in [ + "application", + "audio", + "font", + "example", + "image", + "message", + "model", + "multipart", + "text", + "video", + ]: + self.metadata_merged["object_content_identifier"][oi]["type"] = ( + str(mime_parts[-2]) + "/" + str(mime_parts[-1]) + ) + oi += 1 # clean empty entries for mk, mv in list(self.metadata_merged.items()): if mv == "" or mv is None: @@ -328,7 +356,6 @@ def 
harvest_all_metadata(self): # ========= clean merged metadata, delete all entries which are None or '' self.retrieve_metadata_embedded() self.retrieve_metadata_external() - self.clean_metadata() self.logger.info( "FsF-F2-01M : Type of object described by the metadata -: {}".format( self.metadata_merged.get("object_type") @@ -374,8 +401,8 @@ def harvest_github(self): def retrieve_metadata_embedded(self): self.metadata_harvester.retrieve_metadata_embedded() - self.metadata_unmerged.extend(self.metadata_harvester.metadata_unmerged) - self.metadata_merged.update(self.metadata_harvester.metadata_merged) + # self.metadata_unmerged.extend(self.metadata_harvester.metadata_unmerged) + # self.metadata_merged.update(self.metadata_harvester.metadata_merged) self.repeat_pid_check = self.metadata_harvester.repeat_pid_check self.namespace_uri.extend(self.metadata_harvester.namespace_uri) self.metadata_sources.extend(self.metadata_harvester.metadata_sources) @@ -391,8 +418,8 @@ def retrieve_metadata_embedded(self): def retrieve_metadata_external(self, target_url=None, repeat_mode=False): self.metadata_harvester.retrieve_metadata_external(target_url, repeat_mode=repeat_mode) - self.metadata_unmerged.extend(self.metadata_harvester.metadata_unmerged) - self.metadata_merged.update(self.metadata_harvester.metadata_merged) + # self.metadata_unmerged.extend(self.metadata_harvester.metadata_unmerged) + # self.metadata_merged.update(self.metadata_harvester.metadata_merged) self.repeat_pid_check = self.metadata_harvester.repeat_pid_check self.namespace_uri.extend(self.metadata_harvester.namespace_uri) self.metadata_sources.extend(self.metadata_harvester.metadata_sources) @@ -403,35 +430,10 @@ def retrieve_metadata_external(self, target_url=None, repeat_mode=False): self.pid_scheme = self.metadata_harvester.pid_scheme self.pid_collector.update(self.metadata_harvester.pid_collector) - """def lookup_metadatastandard_by_name(self, value): - found = None - # get standard name with the highest 
matching percentage using fuzzywuzzy - highest = process.extractOne(value, FAIRCheck.COMMUNITY_METADATA_STANDARDS_NAMES, scorer=fuzz.token_sort_ratio) - if highest[1] > 80: - found = highest[2] - return found - - def lookup_metadatastandard_by_uri(self, value): - found = None - if value: - value = str(value).strip().strip('#/') - # try to find it as direct match using http or https as prefix - if value.startswith('http') or value.startswith('ftp'): - value = value.replace('s://', '://') - found = FAIRCheck.COMMUNITY_METADATA_STANDARDS_URIS.get(value) - if not found: - found = FAIRCheck.COMMUNITY_METADATA_STANDARDS_URIS.get(value.replace('://', 's://')) - if not found: - #fuzzy as fall back - try: - match = process.extractOne(value, - FAIRCheck.COMMUNITY_METADATA_STANDARDS_URIS.keys()) - if extract(str(value)).domain == extract(str(match[1]).domain): - if match[1] > 90: - found = list(FAIRCheck.COMMUNITY_METADATA_STANDARDS_URIS.values())[match[2]] - except Exception as e: - pass - return found""" + def set_harvested_metadata(self): + self.metadata_unmerged = self.metadata_harvester.metadata_unmerged + self.metadata_merged = self.metadata_harvester.metadata_merged + self.clean_metadata() def check_unique_metadata_identifier(self): unique_identifier_check = FAIREvaluatorUniqueIdentifierMetadata(self) @@ -450,7 +452,6 @@ def check_persistent_data_identifier(self): return persistent_identifier_check.getResult() def check_unique_persistent_metadata_identifier(self): - # self.metadata_harvester.get_signposting_object_identifier() return self.check_unique_metadata_identifier(), self.check_persistent_metadata_identifier() def check_unique_persistent_software_identifier(self): @@ -692,8 +693,18 @@ def set_repository_uris(self): self.metadata_merged["publisher"] = [self.metadata_merged.get("publisher")] for publisher_url in self.metadata_merged.get("publisher"): if self.uri_validator(publisher_url): - if self.landing_domain in publisher_url: + if self.landing_domain in 
publisher_url and publisher_url.count("/") <= 3: self.repository_urls.append(publisher_url) if self.repository_urls: self.repository_urls = list(set(self.repository_urls)) - # print("REPOSITORY: ", self.repository_urls) + print("REPOSITORY URIS: ", self.repository_urls) + + def set_repository_info(self): + self.set_repository_uris() + if self.repository_urls: + for repo_uri in self.repository_urls: + repoharvester = MetadataHarvester(repo_uri) + repoharvester.retrieve_metadata_embedded() + repoharvester.retrieve_metadata_external() + print("########################### REPO METADATA") + print(repoharvester.metadata_merged) diff --git a/fuji_server/controllers/fair_object_controller.py b/fuji_server/controllers/fair_object_controller.py index 39dfe186..94d134ed 100644 --- a/fuji_server/controllers/fair_object_controller.py +++ b/fuji_server/controllers/fair_object_controller.py @@ -11,6 +11,7 @@ from fuji_server.helper.preprocessor import Preprocessor from fuji_server.helper.results_exporter import FAIRResultsMapper from fuji_server.models.fair_results import FAIRResults +from fuji_server.models.harvest_results_metadata import HarvestResultsMetadata async def assess_by_id(body): @@ -80,9 +81,12 @@ async def assess_by_id(body): print("starting harvesting ") ft.harvest_all_metadata() + ft.set_harvested_metadata() uid_result, pid_result = ft.check_unique_persistent_metadata_identifier() if ft.repeat_pid_check: ft.retrieve_metadata_external(ft.pid_url, repeat_mode=True) + ft.set_harvested_metadata() + ft.clean_metadata() ft.harvest_re3_data() ft.harvest_github() core_metadata_result = ft.check_minimal_metatadata() @@ -210,6 +214,20 @@ async def assess_by_id(body): idhelper = IdentifierHelper(ft.pid_url) request["normalized_object_identifier"] = idhelper.get_normalized_id() results.sort(key=lambda d: d["id"]) # sort results by metric ID + #### metadata summary + harvest_result = [] + for metadata in ft.metadata_unmerged: + harvest_result.append( + HarvestResultsMetadata( 
+ metadata.get("offering_method"), + metadata.get("url"), + metadata.get("format"), + metadata.get("schema"), + metadata.get("namespaces"), + metadata.get("metadata"), + ) + ) + ### final_response = FAIRResults( request=request, start_timestamp=starttimestmp, @@ -222,6 +240,7 @@ async def assess_by_id(body): results=results, summary=summary, resolved_url=resolved_url, + harvested_metadata=harvest_result, ) accept_header = connexion.request.headers.get("Accept") print("ACCEPT HEADER ", accept_header) diff --git a/fuji_server/evaluators/fair_evaluator_data_content_metadata.py b/fuji_server/evaluators/fair_evaluator_data_content_metadata.py index 5276b47e..cb10460e 100644 --- a/fuji_server/evaluators/fair_evaluator_data_content_metadata.py +++ b/fuji_server/evaluators/fair_evaluator_data_content_metadata.py @@ -39,15 +39,21 @@ def subtestDataContentInfoGiven(self): def subtestResourceTypeGiven(self): test_result = False test_score = self.getTestConfigScore(self.metric_identifier + "-1a") + is_dataset = False resource_types = self.fuji.metadata_merged.get("object_type") + found_resource_types = [] if resource_types: if not isinstance(resource_types, list): resource_types = [resource_types] + for resource_type in resource_types: resource_type = str(resource_type).lower() if str(resource_type).startswith("http"): # http://schema.org/Dataset resource_type = str(resource_type).split("/")[-1] + found_resource_types.append(resource_type) + if "dataset" in resource_type: + is_dataset = True if ( str(resource_type).lower() in self.fuji.VALID_RESOURCE_TYPES or resource_type in self.fuji.SCHEMA_ORG_CONTEXT @@ -68,8 +74,15 @@ def subtestResourceTypeGiven(self): + " : Invalid resource type (e.g. 
subtype of schema.org/CreativeWork, DCMI Type or DataCite resourceType) specified -: " + str(resource_type) ) - else: - self.logger.warning(self.metric_identifier + " : NO resource type specified ") + if not is_dataset: + self.logger.error( + self.metric_identifier + + " : The evaluated resource does not identify itself as a “dataset” but as " + + str(found_resource_types) + + ", so F-UJI may not be the right tool for this type of resource " + ) + # else: + # self.logger.warning(self.metric_identifier + " : NO resource type specified ") return test_result def testMinimalInformationAboutDataContentAvailable(self): @@ -98,15 +111,37 @@ def subtestDataTypeAndSizeGiven(self, test_data_content_url): test_result = True self.setEvaluationCriteriumScore(self.metric_identifier + "-2a", 0, "pass") self.logger.log( - self.fuji.LOG_SUCCESS, self.metric_identifier + " : Found file size and type specified in metadata" + self.fuji.LOG_SUCCESS, + self.metric_identifier + + f" : Found file size and type specified in metadata for -: {test_data_content_url}", ) elif not data_object.get("claimed_type"): self.logger.warning( - f"{self.metric_identifier} : NO info about file type available in given metadata -: " + f"{self.metric_identifier} : NO info about file type available in given metadata for -: {test_data_content_url}" ) else: self.logger.warning( - f"{self.metric_identifier} : NO info about file size available in given metadata -: " + f"{self.metric_identifier} : NO info about file size available in given metadata for -: {test_data_content_url}" + ) + return test_result + + def subtestServiceProtocolServiceEndpointGiven(self, test_data_content_url): + test_result = False + if test_data_content_url: + data_object = self.fuji.content_identifier.get(test_data_content_url) + # print(data_object) + if data_object.get("claimed_service") and data_object.get("url"): + print("SERVICE and URL GIVEN ", type(data_object.get("claimed_service"))) + test_result = True + 
self.setEvaluationCriteriumScore(self.metric_identifier + "-2c", 0, "pass") + self.logger.log( + self.fuji.LOG_SUCCESS, + self.metric_identifier + + f" : Found data service endpoint and protocol specified in metadata for -: {test_data_content_url}", + ) + elif not data_object.get("claimed_service"): + self.logger.info( + f"{self.metric_identifier} : NO info about data service endpoint available in given metadata for -: {test_data_content_url}" ) return test_result @@ -129,6 +164,8 @@ def testVerifiableDataDescriptorsAvailable(self, test_data_content_url): if test_data_content_url: if self.subtestDataTypeAndSizeGiven(test_data_content_url): test_result = True + if self.subtestServiceProtocolServiceEndpointGiven(test_data_content_url): + test_result = True if self.subtestMeasuredVariablesGiven(): test_result = True if test_result and self.metric_identifier + "-2" not in self.test_passed: @@ -138,14 +175,15 @@ def testVerifiableDataDescriptorsAvailable(self, test_data_content_url): self.maturity = self.metric_tests.get(self.metric_identifier + "-2").metric_test_maturity_config return test_result - def testSizeAndTypeMatchesMetadata(self, test_data_content_url): + def testSizeAndTypeOrProtocolMatchesMetadata(self, test_data_content_url): test_result = False size_matches = False type_matches = False + protocol_matches = False if self.isTestDefined(self.metric_identifier + "-3"): test_score = self.getTestConfigScore(self.metric_identifier + "-3") data_object = self.fuji.content_identifier.get(test_data_content_url) - if data_object.get("claimed_type") and data_object.get("claimed_size"): + if data_object.get("claimed_type") or data_object.get("claimed_size") or data_object.get("claimed_service"): if not isinstance(data_object.get("tika_content_type"), list): data_object["tika_content_type"] = [data_object.get("tika_content_type")] if data_object.get("content_size") and data_object.get("claimed_size"): @@ -183,11 +221,18 @@ def testSizeAndTypeMatchesMetadata(self, 
test_data_content_url): str(data_object.get("content_size")), ) ) - data_content_filesize_inner = DataContentMetadataOutputInner() - data_content_filesize_inner.descriptor = "file size" - data_content_filesize_inner.descriptor_value = data_object.get("claimed_size") - data_content_filesize_inner.matches_content = size_matches - self.data_content_descriptors.append(data_content_filesize_inner) + else: + self.logger.info( + "{} : No content size given for downloaded file -: {}".format( + self.metric_identifier, + str(data_object.get("url")), + ) + ) + data_content_filesize_inner = DataContentMetadataOutputInner() + data_content_filesize_inner.descriptor = "file size" + data_content_filesize_inner.descriptor_value = data_object.get("claimed_size") + data_content_filesize_inner.matches_content = size_matches + self.data_content_descriptors.append(data_content_filesize_inner) except Exception: self.logger.warning( "{} : Could not verify content size from downloaded file -: (expected: {}, found: {})".format( @@ -225,12 +270,32 @@ def testSizeAndTypeMatchesMetadata(self, test_data_content_url): + str(data_object.get("header_content_type")), ) ) - data_content_filetype_inner = DataContentMetadataOutputInner() - data_content_filetype_inner.descriptor = "file type" - data_content_filetype_inner.descriptor_value = data_object.get("claimed_type") - data_content_filetype_inner.matches_content = type_matches - self.data_content_descriptors.append(data_content_filetype_inner) - if size_matches and type_matches and self.metric_identifier + "-3" not in self.test_passed: + if data_object.get("claimed_service"): + protocol_mime_types = ["application/xml", "text/xml", "application/ld+json", " application/json"] + if data_object.get("tika_content_type"): + for tika_type in data_object.get("tika_content_type"): + if tika_type in protocol_mime_types: + protocol_matches = True + self.logger.info( + "{} : Sucessfully verified commonly used protocol mime type -: (expected: {}, found: via 
tika {})".format( + self.metric_identifier, + protocol_mime_types, + str(data_object.get("tika_content_type")), + ) + ) + data_content_protocol_inner = DataContentMetadataOutputInner() + data_content_protocol_inner.descriptor = "data protocol" + data_content_protocol_inner.descriptor_value = data_object.get("claimed_service") + data_content_protocol_inner.matches_content = protocol_matches + self.data_content_descriptors.append(data_content_protocol_inner) + data_content_filetype_inner = DataContentMetadataOutputInner() + data_content_filetype_inner.descriptor = "file type" + data_content_filetype_inner.descriptor_value = data_object.get("claimed_type") + data_content_filetype_inner.matches_content = type_matches + self.data_content_descriptors.append(data_content_filetype_inner) + if ( + (size_matches and type_matches) or protocol_matches + ) and self.metric_identifier + "-3" not in self.test_passed: self.test_passed.append(self.metric_identifier + "-3") self.score.earned += test_score self.setEvaluationCriteriumScore(self.metric_identifier + "-3", test_score, "pass") @@ -300,10 +365,12 @@ def evaluate(self): for test_data_content_url in test_data_content_urls: if self.testVerifiableDataDescriptorsAvailable(test_data_content_url): test_status = "pass" - if self.testSizeAndTypeMatchesMetadata(test_data_content_url): + if self.testSizeAndTypeOrProtocolMatchesMetadata(test_data_content_url): test_status = "pass" if self.testVariablesMatchMetadata(test_data_content_url): test_status = "pass" + if self.subtestServiceProtocolServiceEndpointGiven(test_data_content_url): + test_status = "pass" else: self.logger.warning( self.metric_identifier diff --git a/fuji_server/evaluators/fair_evaluator_persistent_identifier_metadata.py b/fuji_server/evaluators/fair_evaluator_persistent_identifier_metadata.py index e1b82fa8..345cfe8c 100644 --- a/fuji_server/evaluators/fair_evaluator_persistent_identifier_metadata.py +++ 
b/fuji_server/evaluators/fair_evaluator_persistent_identifier_metadata.py @@ -41,67 +41,82 @@ def setPidsOutput(self): output_inner.resolved_url = pid_info.get("resolved_url") self.output.persistent_identifiers.append(output_inner) - def testCompliesWithPIDScheme(self): + def testCompliesWithPIDScheme(self, pid_dict): test_status = False + remaining_pid_dict = {} if self.isTestDefined(self.metric_identifier + "-1"): test_score = self.getTestConfigScore(self.metric_identifier + "-1") - for pid, pid_info in self.fuji.pid_collector.items(): - if pid_info.get("verified"): - if pid_info.get("is_persistent"): - test_status = True - else: - self.logger.warning( - self.metric_identifier - + " : Skipping PID syntax test since the PID seems to resolve to a different entity" - ) - if test_status: + for pid, pid_info in pid_dict.items(): + if pid_info.get("is_persistent"): + remaining_pid_dict[pid] = pid_info + # for older versions of metric (<0.6)which do not test this separately + if not self.isTestDefined(self.metric_identifier + "-3") and not pid_info.get("verified"): + remaining_pid_dict.pop(pid, None) + self.logger.warning( + self.metric_identifier + + " : Skipping PID syntax test since the PID seems to resolve to a different entity" + ) + if remaining_pid_dict: self.setEvaluationCriteriumScore(self.metric_identifier + "-1", test_score, "pass") self.score.earned = test_score self.maturity = self.metric_tests.get(self.metric_identifier + "-1").metric_test_maturity_config test_status = True - return test_status + return test_status, remaining_pid_dict - def testIfLandingPageResolves(self): + def testIfPersistentIdentifierResolves(self, pid_dict): test_status = False + remaining_pid_dict = {} if self.isTestDefined(self.metric_identifier + "-2"): test_score = self.getTestConfigScore(self.metric_identifier + "-2") - for pid, pid_info in self.fuji.pid_collector.items(): - if self.fuji.verify_pids: + for pid, pid_info in pid_dict.items(): + if pid_info.get("resolved_url"): + 
remaining_pid_dict[pid] = pid_info self.fuji.isLandingPageAccessible = True - self.logger.info( + self.logger.log( + self.fuji.LOG_SUCCESS, self.metric_identifier - + " : Found PID which was not verified (if it does resolve properly) due to config settings -: " - + str(pid) + + " : Found PID which resolves properly to e.g. a landing page-: " + + str(pid), ) - elif pid_info.get("verified"): - if pid_info.get("resolved_url"): - self.fuji.isLandingPageAccessible = True - self.logger.info( - self.metric_identifier - + " : Found PID which could be verified (does resolve properly) -: " - + str(pid) - ) - else: - self.logger.info( - self.metric_identifier - + " : Found PID which could not be verified (no landing page found) -: " - + str(pid) - ) else: - self.logger.info( + self.logger.warning( self.metric_identifier - + " : Found PID which could not be verified (does not resolve properly) -: " + + " : Found PID which could not be verified (no landing page found) -: " + str(pid) ) if self.fuji.isLandingPageAccessible: test_status = True self.setEvaluationCriteriumScore(self.metric_identifier + "-2", test_score, "pass") self.maturity = self.metric_tests.get(self.metric_identifier + "-2").metric_test_maturity_config - self.score.earned = self.total_score # idenfier should be based on a persistence scheme and resolvable + self.score.earned += test_score # idenfier should be based on a persistence scheme and resolvable self.logger.log( self.fuji.LOG_SUCCESS, self.metric_identifier + f" : Persistence identifier scheme -: {self.fuji.pid_scheme}", ) + return test_status, remaining_pid_dict + + def testIfPersistentIdentifierResolvestoDomain(self, pid_dict): + test_status = False + if self.isTestDefined(self.metric_identifier + "-3"): + test_score = self.getTestConfigScore(self.metric_identifier + "-3") + for pid, pid_info in pid_dict.items(): + if pid_info.get("verified"): + self.logger.log( + self.fuji.LOG_SUCCESS, + self.metric_identifier + + " : Found PID could be verified, it 
resolves back to the domain of landing page-: " + + str(pid), + ) + test_status = True + self.setEvaluationCriteriumScore(self.metric_identifier + "-3", test_score, "pass") + self.maturity = self.metric_tests.get(self.metric_identifier + "-3").metric_test_maturity_config + self.score.earned = self.total_score + else: + self.logger.warning( + self.metric_identifier + + " : Found PID could NOT be verified since it resolves to a different domain than those of the landing page-: " + + str(pid) + ) return test_status def evaluate(self): @@ -119,9 +134,15 @@ def evaluate(self): self.result.test_status = "fail" self.setPidsOutput() - if self.testCompliesWithPIDScheme(): + input_pid_dict = self.fuji.pid_collector + rest_pid_dict = {} + test_status, rest_pid_dict = self.testCompliesWithPIDScheme(input_pid_dict) + if test_status: + self.result.test_status = "pass" + test_status, rest_pid_dict = self.testIfPersistentIdentifierResolves(rest_pid_dict) + if test_status: self.result.test_status = "pass" - if self.testIfLandingPageResolves(): + if self.testIfPersistentIdentifierResolvestoDomain(rest_pid_dict): self.result.test_status = "pass" """else: diff --git a/fuji_server/evaluators/fair_evaluator_searchable.py b/fuji_server/evaluators/fair_evaluator_searchable.py index bbcb3776..16ba6a5d 100644 --- a/fuji_server/evaluators/fair_evaluator_searchable.py +++ b/fuji_server/evaluators/fair_evaluator_searchable.py @@ -31,7 +31,7 @@ def __init__(self, fuji_instance): FAIREvaluator.__init__(self, fuji_instance) self.set_metric("FsF-F4-01M") self.search_mechanisms = [] - self.search_engines_support_offering = ["json_in_html", "meta_tags", "microdata", "rdfa"] + self.search_engines_support_offering = ["json_in_html", "meta_tag", "microdata", "rdfa"] self.search_engines_support_standards = [ "schemaorg", "dublin-core", @@ -159,10 +159,7 @@ def testSearchEngineCompatibleMetadataAvailable(self): + "Found RDFa like metadata which however is empty thus useless for search engines" ) 
search_engine_support_match = list(set(search_engine_support_match)) - # OLD WAY # Check search mechanisms based on sources of metadata extracted. - """search_engine_support_match: List[Any] = list( - set(dict(self.fuji.metadata_sources).keys()).intersection(self.search_engines_support))""" if search_engine_support_match: self.setEvaluationCriteriumScore(self.metric_identifier + "-1", test_score, "pass") self.set_maturity(self.getTestConfigMaturity(self.metric_identifier + "-1")) diff --git a/fuji_server/evaluators/fair_evaluator_semantic_vocabulary.py b/fuji_server/evaluators/fair_evaluator_semantic_vocabulary.py index 324dc36b..9d355d80 100644 --- a/fuji_server/evaluators/fair_evaluator_semantic_vocabulary.py +++ b/fuji_server/evaluators/fair_evaluator_semantic_vocabulary.py @@ -5,7 +5,7 @@ import fnmatch from fuji_server.evaluators.fair_evaluator import FAIREvaluator -from fuji_server.helper.linked_vocab_helper import linked_vocab_helper +from fuji_server.helper.linked_vocab_helper import LinkedVocabHelper from fuji_server.models.semantic_vocabulary import SemanticVocabulary from fuji_server.models.semantic_vocabulary_output_inner import SemanticVocabularyOutputInner @@ -81,7 +81,7 @@ def testSemanticNamespaceURIsAvailable(self): return test_status def testKnownSemanticResourcesUsed(self): - lov_helper = linked_vocab_helper(self.fuji.LINKED_VOCAB_INDEX) + lov_helper = LinkedVocabHelper(self.fuji.LINKED_VOCAB_INDEX) test_status = False communityspecsdefined = False if self.isTestDefined(self.metric_identifier + "-2"): diff --git a/fuji_server/harvester/data_harvester.py b/fuji_server/harvester/data_harvester.py index daf2dc1a..8f2f98cd 100644 --- a/fuji_server/harvester/data_harvester.py +++ b/fuji_server/harvester/data_harvester.py @@ -19,6 +19,7 @@ class DataHarvester: LOG_FAILURE = 35 def __init__(self, data_links, logger, landing_page=None, auth_token=None, auth_token_type="Basic", metrics=None): + self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; 
F-UJI)" self.logger = logger self.data_links = data_links self.auth_token = auth_token @@ -150,7 +151,7 @@ def retrieve_all_data(self, scan_content=True): return True def get_url_data_and_info(self, urldict, timeout): - header = {"Accept": "*/*", "User-Agent": "F-UJI"} + header = {"Accept": "*/*", "User-Agent": self.user_agent} if self.auth_token: header["Authorization"] = self.auth_token_type + " " + self.auth_token # header["Range"] = "bytes=0-" + str(self.max_download_size) @@ -194,6 +195,8 @@ def set_data_info(self, urldict, response): "url": urldict.get("url"), "claimed_size": urldict.get("size"), "claimed_type": urldict.get("type"), + "claimed_service": urldict.get("service"), + "claimed_profile": urldict.get("profile"), "truncated": False, "is_persistent": False, } @@ -223,16 +226,18 @@ def set_data_info(self, urldict, response): fileinfo["header_content_size"] = response.headers.get("content-length").split(";")[0] elif response.headers.get("Content-Length"): fileinfo["header_content_size"] = response.headers.get("Content-Length").split(";")[0] - try: - fileinfo["header_content_size"] = int(fileinfo["header_content_size"]) - except: - fileinfo["header_content_size"] = self.max_download_size - pass + if fileinfo.get("header_content_size"): + try: + fileinfo["header_content_size"] = int(fileinfo["header_content_size"]) + except: + fileinfo["header_content_size"] = self.max_download_size + pass content = response.read(self.max_download_size) file_buffer_object.write(content) fileinfo["content_size"] = file_buffer_object.getbuffer().nbytes - if fileinfo["content_size"] < fileinfo["header_content_size"]: - fileinfo["truncated"] = True + if fileinfo.get("header_content_size"): + if fileinfo["content_size"] < fileinfo["header_content_size"]: + fileinfo["truncated"] = True if fileinfo["content_size"] > 0: fileinfo.update(self.tika(file_buffer_object, urldict.get("url"))) diff --git a/fuji_server/harvester/metadata_harvester.py 
b/fuji_server/harvester/metadata_harvester.py index af797199..3dcd0158 100644 --- a/fuji_server/harvester/metadata_harvester.py +++ b/fuji_server/harvester/metadata_harvester.py @@ -5,6 +5,7 @@ import enum import hashlib import io +import json import logging import mimetypes import re @@ -46,6 +47,10 @@ class MetadataHarvester: "author", "linkset", "cite-as", + "api-catalog", + "service-doc", + "service-desc", + "service-meta", ] def __init__( @@ -357,7 +362,7 @@ def set_html_typed_links(self): href = link.attrib.get("href") rel = link.attrib.get("rel") type = link.attrib.get("type") - profile = link.attrib.get("format") + profile = link.attrib.get("profile") type = str(type).strip() # handle relative paths linkparts = urlparse(href) @@ -383,6 +388,7 @@ def set_html_typed_links(self): "cite-as", "linkset", "license", + "api-catalog", ]: if rel in self.signposting_relation_types: source = MetadataOfferingMethods.SIGNPOSTING @@ -407,23 +413,30 @@ def set_signposting_header_links(self, content, header): def set_signposting_linkset_links(self): linksetlinks = [] linksetlink = {} - if self.get_html_typed_links("linkset"): - linksetlinks = self.get_html_typed_links("linkset") - elif self.get_signposting_header_links("linkset"): - linksetlinks = self.get_signposting_header_links("linkset") + if self.get_html_typed_links(["linkset", "api-catalog"]): + linksetlinks = self.get_html_typed_links(["linkset", "api-catalog"]) + elif self.get_signposting_header_links(["linkset", "api-catalog"]): + linksetlinks = self.get_signposting_header_links(["linkset", "api-catalog"]) if linksetlinks: linksetlink = linksetlinks[0] + print(linksetlinks) try: if linksetlink.get("url"): requestHelper = RequestHelper(linksetlink.get("url"), self.logger) requestHelper.setAcceptType(AcceptTypes.linkset) neg_source, linkset_data = requestHelper.content_negotiate("FsF-F1-02D") + print(requestHelper.request_url, requestHelper.content_type) if isinstance(linkset_data, dict): if 
isinstance(linkset_data.get("linkset"), list): validlinkset = None for candidatelinkset in linkset_data.get("linkset"): if isinstance(candidatelinkset, dict): - if candidatelinkset.get("anchor") in [self.pid_url, self.landing_url]: + # usual describedby etc links must refer via anchor to the landing page or pid + # but api-catalog may refer to another URL which represents an API link + if ( + candidatelinkset.get("anchor") in [self.pid_url, self.landing_url] + or linksetlink.get("rel") == "api-catalog" + ): validlinkset = candidatelinkset break if validlinkset: @@ -445,28 +458,36 @@ def set_signposting_linkset_links(self): self.logger.info("FsF-F2-01M : Found valid Signposting Linkset in provided JSON file") else: self.logger.warning( - "FsF-F2-01M : Found Signposting Linkset but none of the given anchors matches landing oage or PID" + "FsF-F2-01M : Found Signposting Linkset but none of the given anchors matches landing page or PID" ) + print(self.typed_links) else: validlinkset = False - parsed_links = self.parse_signposting_http_link_format(linkset_data.decode()) - try: - if parsed_links[0].get("anchor"): - self.logger.info("FsF-F2-01M : Found valid Signposting Linkset in provided text file") - for parsed_link in parsed_links: - if parsed_link.get("anchor") in [self.pid_url, self.landing_url]: - self.typed_links.append(parsed_link) - validlinkset = True - if not validlinkset: - self.logger.warning( - "FsF-F2-01M : Found Signposting Linkset but none of the given anchors matches landing page or PID" - ) - except Exception as e: + if linkset_data: + parsed_links = self.parse_signposting_http_link_format(linkset_data.decode()) + try: + if parsed_links[0].get("anchor"): + self.logger.info("FsF-F2-01M : Found valid Signposting Linkset in provided text file") + for parsed_link in parsed_links: + if ( + parsed_link.get("anchor") in [self.pid_url, self.landing_url] + or linksetlink.get("rel") == "api-catalog" + ): + self.typed_links.append(parsed_link) + validlinkset = 
True + if not validlinkset: + self.logger.warning( + "FsF-F2-01M : Found Signposting Linkset but none of the given anchors matches landing page or PID" + ) + except Exception as e: + self.logger.warning( + "FsF-F2-01M : Found Signposting Linkset but could not correctly parse the file" + ) + print(e) + else: self.logger.warning( "FsF-F2-01M : Found Signposting Linkset but could not correctly parse the file" ) - print(e) - except Exception as e: self.logger.warning("FsF-F2-01M : Failed to parse Signposting Linkset -: " + str(e)) @@ -553,8 +574,8 @@ def parse_signposting_http_link_format(self, signposting_link_format_text): rel_match = re.search(r'rel\s*=\s*\"?([^,;"]+)\"?', link_prop) elif link_prop.startswith("type"): type_match = re.search(r'type\s*=\s*\"?([^,;"]+)\"?', link_prop) - elif link_prop.startswith("formats"): - formats_match = re.search(r'formats\s*=\s*\"?([^,;"]+)\"?', link_prop) + elif link_prop.startswith("profile"): + formats_match = re.search(r'profile\s*=\s*\"?([^,;"]+)\"?', link_prop) if type_match: found_type = type_match[1] if rel_match: @@ -698,8 +719,6 @@ def retrieve_metadata_embedded(self): self.redirect_url = requestHelper.redirect_url response_status = requestHelper.response_status self.landing_page_status = response_status - # if requestHelper.response_content: - # self.landing_url = requestHelper.redirect_url else: self.logger.warning( "FsF-F2-01M :Skipping Embedded tests, no scheme/protocol detected to be able to resolve " @@ -761,8 +780,11 @@ def retrieve_metadata_embedded(self): extruct_metadata = self.retrieve_metadata_embedded_extruct() # if extruct_metadata: ext_meta = extruct_metadata.get("json-ld") + # comment the line below if jmespath handling of embedded json-ld is preferred, otherwise json-ls always will be handles as graph + ext_meta = json.dumps(ext_meta) + # print('EXT META',ext_meta) self.logger.info("FsF-F2-01M : Trying to retrieve schema.org JSON-LD metadata from html page") - + # TODO: actually schema.org, dcat and 
skos metadata is collected from a json-ld graph so this should be renamed schemaorg_collector_embedded = MetaDataCollectorRdf( loggerinst=self.logger, json_ld_content=ext_meta, source=MetadataSources.SCHEMAORG_EMBEDDED ) @@ -789,7 +811,7 @@ def retrieve_metadata_embedded(self): ) self.logger.log( self.LOG_SUCCESS, - "FsF-F2-01M : Found schema.org JSON-LD metadata in html page -: " + "FsF-F2-01M : Found embedded (schema.org) JSON-LD metadata in html page -: " + str(schemaorg_dict.keys()), ) else: diff --git a/fuji_server/helper/linked_vocab_helper.py b/fuji_server/helper/linked_vocab_helper.py index abab1582..ff0ca9e2 100644 --- a/fuji_server/helper/linked_vocab_helper.py +++ b/fuji_server/helper/linked_vocab_helper.py @@ -13,16 +13,19 @@ logger = logging.getLogger(__name__) -class linked_vocab_helper: +logger = logging.getLogger(__name__) + + +class LinkedVocabHelper: fuji_server_dir = Path(__file__).parent.parent # project_root linked_vocabs_dir = fuji_server_dir / "data/linked_vocabs" def __init__(self, linked_vocab_index={}): self.linked_vocab_index = linked_vocab_index self.linked_vocab_dict = {} - self.ignore_prefixes = ["orcid", "doi", "isni", "ror", "wikipedia"] + self.ignore_prefixes = ["orcid", "doi", "isni", "ror", "wikipedia", "github", "arxiv"] # prefixes used for identifiers only so we ignore these for terms - self.ignore_domain = ["orcid.org", "doi.org", "ror.org", "zenodo.org", "isni.org"] + self.ignore_domain = ["orcid.org", "doi.org", "ror.org", "zenodo.org", "isni.org", "github.com", "arxiv.org"] def set_linked_vocab_dict(self): logger.info("Setting up the vocab dict.........................") diff --git a/fuji_server/helper/metadata_collector.py b/fuji_server/helper/metadata_collector.py index 386677c5..e7272eab 100644 --- a/fuji_server/helper/metadata_collector.py +++ b/fuji_server/helper/metadata_collector.py @@ -8,7 +8,7 @@ from urlextract import URLExtract from fuji_server.helper import metadata_mapper -from 
fuji_server.helper.linked_vocab_helper import linked_vocab_helper +from fuji_server.helper.linked_vocab_helper import LinkedVocabHelper from fuji_server.helper.metadata_mapper import Mapper from fuji_server.helper.preprocessor import Preprocessor @@ -300,7 +300,7 @@ def setLinkedNamespaces(self, meta_source): """ extractor = URLExtract() found_urls = [] - lov_helper = linked_vocab_helper(Preprocessor.linked_vocab_index) + lov_helper = LinkedVocabHelper(Preprocessor.linked_vocab_index) if meta_source is not None: if isinstance(meta_source, str): found_urls = set(extractor.gen_urls(str(meta_source))) diff --git a/fuji_server/helper/metadata_collector_rdf.py b/fuji_server/helper/metadata_collector_rdf.py index 9ce6fedd..b755e996 100644 --- a/fuji_server/helper/metadata_collector_rdf.py +++ b/fuji_server/helper/metadata_collector_rdf.py @@ -4,6 +4,7 @@ import json import re +import urllib import idutils import jmespath @@ -150,20 +151,21 @@ def get_metadata_from_graph(self, rdf_response_graph): self.logger.info("FsF-F2-01M : Trying to identify namespaces in RDF Graph") graph_namespaces = self.set_namespaces(rdf_response_graph) # self.getNamespacesfromIRIs(graph_text) - # TODO: set credit score for being valid RDF - # TODO: since its valid RDF aka semantic representation, make sure FsF-I1-01M is passed and scored + schema_metadata, dcat_metadata, skos_metadata = {}, {}, {} if rdflib.term.URIRef("http://www.w3.org/ns/dcat#") in graph_namespaces.values(): self.logger.info("FsF-F2-01M : RDF Graph seems to contain DCAT metadata elements") - rdf_metadata = self.get_dcat_metadata(rdf_response_graph) - elif ( + dcat_metadata = self.get_dcat_metadata(rdf_response_graph) + if ( rdflib.term.URIRef("http://schema.org/") in graph_namespaces.values() or rdflib.term.URIRef("https://schema.org/") in graph_namespaces.values() ): self.logger.info("FsF-F2-01M : RDF Graph seems to contain schema.org metadata elements") - rdf_metadata = 
self.get_schemaorg_metadata_from_graph(rdf_response_graph) - elif bool(set(ontology_indicator) & set(graph_namespaces.values())): + schema_metadata = self.get_schemaorg_metadata_from_graph(rdf_response_graph) + if bool(set(ontology_indicator) & set(graph_namespaces.values())): self.logger.info("FsF-F2-01M : RDF Graph seems to contain SKOS/OWL metadata elements") - rdf_metadata = self.get_ontology_metadata(rdf_response_graph) + skos_metadata = self.get_ontology_metadata(rdf_response_graph) + # merging metadata dicts + rdf_metadata = skos_metadata | dcat_metadata | schema_metadata # else: if not rdf_metadata: self.logger.info( @@ -240,104 +242,118 @@ def parse_metadata(self): ): self.source_name = MetadataSources.SCHEMAORG_NEGOTIATED self.metadata_format = MetadataFormats.JSONLD - self.logger.info("FsF-F2-01M : Try to parse RDF (JSON-LD) from -: %s" % (jsonld_source_url)) - if isinstance(rdf_response, bytes): - try: - rdf_response = rdf_response.decode("utf-8") - except: - pass - if isinstance(rdf_response, dict) or isinstance(rdf_response, list): - self.logger.info( - "FsF-F2-01M : Try to parse JSON-LD using JMESPath retrieved as dict from -: %s" - % (jsonld_source_url) - ) - # in case two or more JSON-LD strings are embedded - if isinstance(rdf_response, list): - json_dict = None - if len(rdf_response) > 1: - self.logger.info( - "FsF-F2-01M : Found more than one JSON-LD embedded in landing page try to identify Dataset or CreativeWork type" - ) - for meta_rec in rdf_response: - meta_rec_type = str(meta_rec.get("@type")).lower().lstrip("schema:") - if meta_rec_type in ["dataset"]: - json_dict = meta_rec - break - if meta_rec_type in self.SCHEMA_ORG_CREATIVEWORKS: - json_dict = meta_rec - if not json_dict: - rdf_response_dict = rdf_response[0] - else: - rdf_response_dict = json_dict - else: - rdf_response_dict = rdf_response - try: - rdf_metadata = self.get_schemorg_metadata_from_dict(rdf_response_dict) - if rdf_metadata: - 
self.setLinkedNamespaces(str(rdf_response_dict)) - else: - self.logger.info( - "FsF-F2-01M : Could not identify schema.org JSON-LD metadata using JMESPath, continuing with RDF graph processing" - ) + if rdf_response: + self.logger.info("FsF-F2-01M : Try to parse RDF (JSON-LD) from -: %s" % (jsonld_source_url)) + if isinstance(rdf_response, bytes): + try: + rdf_response = rdf_response.decode("utf-8") + except: + pass + if isinstance(rdf_response, dict) or isinstance(rdf_response, list): + self.logger.info( + "FsF-F2-01M : Try to parse JSON-LD using JMESPath retrieved as dict from -: %s" + % (jsonld_source_url) + ) + # in case two or more JSON-LD strings are embedded + if isinstance(rdf_response, list): + json_dict = None + if len(rdf_response) > 1: + self.logger.info( + "FsF-F2-01M : Found more than one JSON-LD embedded in landing page try to identify Dataset or CreativeWork type" + ) + for meta_rec in rdf_response: + meta_rec_type = str(meta_rec.get("@type")).lower().lstrip("schema:") + if meta_rec_type in ["dataset"]: + json_dict = meta_rec + break + if meta_rec_type in self.SCHEMA_ORG_CREATIVEWORKS: + json_dict = meta_rec + if not json_dict: + rdf_response = rdf_response[0] + else: + rdf_response = json_dict + # else: + # rdf_response_dict = rdf_response + try: + # rdf_response_json = json.dumps(rdf_response_dict) + # rdf_metadata = self.get_schemorg_metadata_from_dict(rdf_response_dict) + # rdf_metadata = self.get_schemaorg_metadata_from_graph(rdf_response_json) + # if rdf_metadata: + # self.setLinkedNamespaces(str(rdf_response)) + # else: + # self.logger.info( + # "FsF-F2-01M : Could not identify schema.org JSON-LD metadata using JMESPath, continuing with RDF graph processing" + # ) # quick fix for https://github.com/RDFLib/rdflib/issues/1484 # needs to be done before dict is converted to string # print(rdf_response) - if isinstance(rdf_response, dict): - if rdf_response.get("@context"): - if rdf_response.get("@graph"): - try: - # drop duplicate context in 
graph - if isinstance(rdf_response.get("@graph"), list): - for grph in rdf_response.get("@graph"): - if grph.get("@context"): - del grph["@context"] - else: - if rdf_response.get("@graph").get("@context"): - del rdf_response["@graph"]["@context"] - except Exception: - print("Failed drop duplicate JSON-LD context in graph") - pass - # Fixing Dereferencing issues: https://github.com/json-ld/json-ld.org/issues/747 - if isinstance(rdf_response.get("@context"), list): - for ctxi, ctxt in enumerate(rdf_response.get("@context")): - if "schema.org" in ctxt: - rdf_response["@context"][ctxi] = ( - "https://schema.org/docs/jsonldcontext.json" - ) - if isinstance(rdf_response.get("@context"), str): - if "schema.org" in rdf_response.get("@context"): - rdf_response["@context"] = "https://schema.org/docs/jsonldcontext.json" - # expand graph - rdf_response = jsonld.expand(rdf_response) - # convert dict to json string again for RDF graph parsing - rdf_response = json.dumps(rdf_response) - except Exception as e: - print("RDF Collector Error: ", e) - pass - # try to make graph from JSON-LD string - if isinstance(rdf_response, str): - try: - rdf_response = str(rdf_response).encode("utf-8") - except: - self.logger.info("FsF-F2-01M : UTF-8 string conversion of JSON-LD failed") - pass - self.logger.info( - "FsF-F2-01M : Try to parse JSON-LD using RDFLib retrieved as string from -: %s" - % (jsonld_source_url) - ) - try: - jsonldgraph = rdflib.ConjunctiveGraph(identifier=self.resolved_url) - rdf_response_graph = jsonldgraph.parse( - data=rdf_response, format="json-ld", publicID=self.resolved_url + if 1 == 1: + if isinstance(rdf_response, dict): + if rdf_response.get("@context"): + if rdf_response.get("@graph"): + try: + # drop duplicate context in graph + if isinstance(rdf_response.get("@graph"), list): + for grph in rdf_response.get("@graph"): + if grph.get("@context"): + del grph["@context"] + else: + if rdf_response.get("@graph").get("@context"): + del rdf_response["@graph"]["@context"] 
+ except Exception: + print("Failed drop duplicate JSON-LD context in graph") + pass + # Fixing Dereferencing issues: https://github.com/json-ld/json-ld.org/issues/747 + if isinstance(rdf_response.get("@context"), list): + for ctxi, ctxt in enumerate(rdf_response.get("@context")): + if "schema.org" in ctxt: + rdf_response["@context"][ctxi] = ( + "https://schema.org/docs/jsonldcontext.json" + ) + if isinstance(rdf_response.get("@context"), str): + if "schema.org" in rdf_response.get("@context"): + rdf_response["@context"] = "https://schema.org/docs/jsonldcontext.json" + # expand graph + rdf_response = jsonld.expand(rdf_response) + # convert dict to json string again for RDF graph parsing + rdf_response = json.dumps(rdf_response) + except Exception as e: + print("RDF Collector Error: ", e) + pass + # try to make graph from JSON-LD string + if isinstance(rdf_response, str) and rdf_response not in ["null", "None"]: + # url escape malformed (spaces) URIs + try: + suris = re.findall('"http[s]?:\/\/(.*?)"', rdf_response) + for suri in suris: + if " " in suri: + rsuri = urllib.parse.quote(suri) + rdf_response = rdf_response.replace(suri, rsuri) + except: + pass + # encoding + try: + rdf_response = str(rdf_response).encode("utf-8") + except: + self.logger.info("FsF-F2-01M : UTF-8 string conversion of JSON-LD failed") + pass + self.logger.info( + "FsF-F2-01M : Try to parse JSON-LD using RDFLib retrieved as string from -: %s" + % (jsonld_source_url) ) - # rdf_response_graph = jsonldgraph - self.setLinkedNamespaces(self.getAllURIS(jsonldgraph)) - except Exception as e: - print("JSON-LD parsing error", e, rdf_response[:100]) - self.logger.info(f"FsF-F2-01M : Parsing error (RDFLib), failed to extract JSON-LD -: {e}") + try: + jsonldgraph = rdflib.ConjunctiveGraph(identifier=self.resolved_url) + rdf_response_graph = jsonldgraph.parse( + data=rdf_response, format="json-ld", publicID=self.resolved_url + ) + + # rdf_response_graph = jsonldgraph + 
self.setLinkedNamespaces(self.getAllURIS(jsonldgraph)) + except Exception as e: + print("JSON-LD parsing error", e, rdf_response[:100]) + self.logger.info(f"FsF-F2-01M : Parsing error (RDFLib), failed to extract JSON-LD -: {e}") elif self.accept_type == AcceptTypes.rdf: - # print('ACCEPT: ',self.accept_type) # parse all other RDF formats (non JSON-LD schema.org) # parseformat = re.search(r'[\/+]([a-z0-9]+)$', str(requestHelper.content_type)) format_dict = { @@ -438,9 +454,9 @@ def get_sparqled_metadata(self, g): self.logger.info("FsF-F2-01M : Trying to query generic SPARQL on RDF, found triples: -:" + str(len(g))) r = g.query(Mapper.GENERIC_SPARQL.value) for row in r: - for relation_type, related_resource in row.asdict().items(): - if relation_type is not None: - if relation_type in [ + for row_property, row_value in row.asdict().items(): + if row_property is not None: + if row_property in [ "references", "source", "isVersionOf", @@ -457,11 +473,12 @@ def get_sparqled_metadata(self, g): if not meta.get("related_resources"): meta["related_resources"] = [] meta["related_resources"].append( - {"related_resource": str(related_resource), "relation_type": relation_type} + {"related_resource": str(row_value), "relation_type": row_property} ) else: - if related_resource: - meta[relation_type] = str(related_resource) + if row_value: + if not isinstance(row_value, rdflib.term.BNode): + meta[row_property] = str(row_value) if meta: break # break @@ -510,7 +527,7 @@ def get_sparqled_metadata(self, g): return meta # TODO rename to: get_core_metadata - def get_metadata(self, g, item, type="Dataset"): + def get_core_metadata(self, g, item, type="Dataset"): """Get the core (domain agnostic, DCAT, DC, schema.org) metadata given in RDF graph. 
Parameters @@ -545,7 +562,12 @@ def get_metadata(self, g, item, type="Dataset"): + list(g.objects(item, SMA.identifier)) + list(g.objects(item, SDO.sameAs)) + list(g.objects(item, SMA.sameAs)) + + list(g.objects(item, SMA.url)) + + list(g.objects(item, SDO.url)) ): + idvalue = g.value(identifier, SDO.value) or g.value(identifier, SMA.value) + if idvalue: + identifier = idvalue meta["object_identifier"].append(str(identifier)) if not meta.get("language"): meta["language"] = str( @@ -640,9 +662,12 @@ def get_metadata(self, g, item, type="Dataset"): meta["contributor"].append(str(contributor)) if not meta.get("license"): - meta["license"] = str( - g.value(item, DCTERMS.license) or g.value(item, SDO.license) or g.value(item, SMA.license) - ) + license_item = g.value(item, DCTERMS.license) or g.value(item, SDO.license) or g.value(item, SMA.license) + # schema.org + license_value = str(license_item) + if g.value(license_item, SDO.url) or g.value(license_item, SMA.url): + license_value = g.value(license_item, SDO.url) or g.value(license_item, SMA.url) + meta["license"] = str(license_value) if not meta.get("access_level"): meta["access_level"] = str( g.value(item, DCTERMS.accessRights) @@ -720,12 +745,12 @@ def get_ontology_metadata(self, graph): ontologies = list(graph[: RDF.type : OWL.Ontology]) if len(ontologies) > 0: self.logger.info("FsF-F2-01M : RDF Graph seems to represent a OWL Ontology") - ont_metadata = self.get_metadata(graph, ontologies[0], type="DefinedTermSet") + ont_metadata = self.get_core_metadata(graph, ontologies[0], type="DefinedTermSet") else: ontologies = list(graph[: RDF.type : SKOS.ConceptScheme]) or list(graph[: RDF.type : SKOS.Collection]) if len(ontologies) > 0: self.logger.info("FsF-F2-01M : RDF Graph seems to represent a SKOS Ontology") - ont_metadata = self.get_metadata(graph, ontologies[0], type="DefinedTermSet") + ont_metadata = self.get_core_metadata(graph, ontologies[0], type="DefinedTermSet") else: self.logger.info("FsF-F2-01M : Could not 
parse Ontology RDF") return ont_metadata @@ -863,12 +888,8 @@ def get_schemorg_metadata_from_dict(self, json_dict): self.logger.info("FsF-I3-01M : No related resource(s) found in Schema.org metadata") if jsnld_metadata.get("object_size"): - # print(jsnld_metadata.get('object_size')) if isinstance(jsnld_metadata["object_size"], dict): jsnld_metadata["object_size"] = str(jsnld_metadata["object_size"].get("value")) - - # jsnld_metadata['object_size'] = str(jsnld_metadata['object_size'].get('value')) + ' '+ jsnld_metadata['object_size'].get('unitText') - else: self.logger.info( "FsF-F2-01M : Found JSON-LD but record is not of type schema.org based on context -: " @@ -885,27 +906,27 @@ def get_schemorg_metadata_from_dict(self, json_dict): jsnld_metadata = {} return jsnld_metadata - def get_schemaorg_metadata_from_graph(self, graph): - # we will only test creative works and subtypes - creative_work_types = Preprocessor.get_schema_org_creativeworks() - creative_work = None - schema_metadata = {} - SMA = Namespace("http://schema.org/") - # use only schema.org properties and create graph using these. - # is e.g. 
important in case schema.org is encoded as RDFa and variuos namespaces are used - creative_work_type = "Dataset" + def find_root_candidates(self, graph, allowed_types=["Dataset"]): + allowed_types = [at.lower() for at in allowed_types if isinstance(at, str)] + cand_creative_work = {} + object_types_dict = {} try: - cand_creative_work = {} - object_types_dict = {} for root in rdflib.util.find_roots(graph, RDF.type): # we have https and http as allowed schema.org namespace protocols + if "schema.org" in str(root): root_name = str(root).rsplit("/")[-1].strip() - if root_name.lower() in creative_work_types: + elif "dcat" in str(root): + root_name = str(root).rsplit("#")[-1].strip() + else: + root_name = None + if root_name: + if root_name.lower() in allowed_types: creative_works = list(graph[: RDF.type : root]) - # print(root, type(creative_works[0]), list(graph.subjects(object=creative_works[0]))) # Finding the schema.org root creative_work_subjects = list(graph.subjects(object=creative_works[0])) + # don't list yourself... + creative_work_subjects = [crs for crs in creative_work_subjects if crs != creative_works[0]] if len(creative_work_subjects) == 0: cand_creative_work[root_name] = creative_works[0] if object_types_dict.get(str(creative_works[0])): @@ -916,7 +937,21 @@ def get_schemaorg_metadata_from_graph(self, graph): # helps for ro crate elif graph.identifier in creative_work_subjects: cand_creative_work[root_name] = creative_works[0] + except Exception as ee: + print("ROOT IDENTIFICATION ERROR: ", ee) + return cand_creative_work, object_types_dict + def get_schemaorg_metadata_from_graph(self, graph): + # we will only test creative works and subtypes + creative_work_types = Preprocessor.get_schema_org_creativeworks() + creative_work = None + schema_metadata = {} + SMA = Namespace("http://schema.org/") + # use only schema.org properties and create graph using these. + # is e.g. 
important in case schema.org is encoded as RDFa and variuos namespaces are used + creative_work_type = "Dataset" + try: + cand_creative_work, object_types_dict = self.find_root_candidates(graph, creative_work_types) if cand_creative_work: # prioritize Dataset type if "Dataset" in cand_creative_work: @@ -927,11 +962,26 @@ def get_schemaorg_metadata_from_graph(self, graph): except Exception as e: self.logger.info("FsF-F2-01M : Schema.org RDF graph parsing failed -: " + str(e)) + print("Cand Creative work identification Error", e) if creative_work: - schema_metadata = self.get_metadata(graph, creative_work, type=creative_work_type) + schema_metadata = self.get_core_metadata(graph, creative_work, type=creative_work_type) # object type (in case there are more than one if isinstance(object_types_dict.get(str(creative_work)), list): schema_metadata["object_type"] = object_types_dict.get(str(creative_work)) + # "access_free" + access_free = graph.value(creative_work, SMA.isAccessibleForFree) or graph.value( + creative_work, SDO.isAccessibleForFree + ) + if access_free: + schema_metadata["access_free"] = access_free + # object size (total) + + object_size = graph.value(creative_work, SMA.size) or graph.value(creative_work, SDO.size) + if object_size: + size_value = graph.value(object_size, SMA.value) or graph.value(object_size, SDO.value) + if not size_value: + size_value = object_size + schema_metadata["object_size"] = size_value # creator creator_node = None if graph.value(creative_work, SMA.creator): @@ -955,10 +1005,16 @@ def get_schemaorg_metadata_from_graph(self, graph): ) if len(creator_name) > 0: schema_metadata["creator"] = creator_name - - distribution = graph.objects(creative_work, SMA.distribution) or graph.objects( - creative_work, SDO.distribution + distribution = list(graph.objects(creative_work, SMA.distribution)) + list( + graph.objects(creative_work, SDO.distribution) ) + # distribution as hasPart which actually are MediaObjects + for haspart in 
list(graph.objects(creative_work, SMA.hasPart)) + list( + graph.objects(creative_work, SDO.hasPart) + ): + if "MediaObject" in str(graph.value(haspart, RDF.type)): + distribution.append(haspart) + schema_metadata["object_content_identifier"] = [] for dist in distribution: durl = ( @@ -967,6 +1023,9 @@ def get_schemaorg_metadata_from_graph(self, graph): or graph.value(dist, SDO.contentUrl) or graph.value(dist, SDO.url) ) + if not durl: + if isinstance(dist, rdflib.term.URIRef): + durl = str(dist) dtype = graph.value(dist, SMA.encodingFormat) or graph.value(dist, SDO.encodingFormat) dsize = graph.value(dist, SMA.contentSize) or graph.value(dist, SDO.contentSize) if durl or dtype or dsize: @@ -977,10 +1036,9 @@ def get_schemaorg_metadata_from_graph(self, graph): {"url": str(durl), "type": dtype, "size": str(dsize)} ) - potential_action = graph.objects(creative_work, SMA.potentialAction) or graph.objects( - creative_work, SDO.potentialAction + potential_action = list(graph.objects(creative_work, SMA.potentialAction)) + list( + graph.objects(creative_work, SDO.potentialAction) ) - schema_metadata["object_content_service"] = [] for potaction in potential_action: service_url, service_desc, service_type = None, None, None @@ -997,22 +1055,55 @@ def get_schemaorg_metadata_from_graph(self, graph): entry_point, SDO.additionalType ) if service_url: - schema_metadata["object_content_service"].append( - {"url": service_url, "type": service_type, "desc": service_desc} + schema_metadata["object_content_identifier"].append( + {"url": service_url, "type": service_type, "service": service_desc} ) schema_metadata["measured_variable"] = [] - for variable in list(graph.objects(creative_works[0], SMA.variableMeasured)) or list( - graph.objects(creative_works[0], SDO.variableMeasured) + for variable in list(graph.objects(creative_work, SMA.variableMeasured)) + list( + graph.objects(creative_work, SDO.variableMeasured) ): - variablename = graph.value(variable, SMA.name) or 
graph.value(variable, SDO.name) + variablename = graph.value(variable, SMA.name) or graph.value(variable, SDO.name) or None + if variablename: schema_metadata["measured_variable"].append(variablename) else: schema_metadata["measured_variable"].append(variable) - #'measured_variable: variableMeasured[*].name || variableMeasured , object_size: size,' \ + # two routes to API services provided by repositories + # 1) via the schema.org/DataCatalog 'offers' property + # 2) via the schema.org/Project 'hasofferCatalog' property + offer_catalog = graph.value(creative_work, SMA.hasOfferCatalog) or graph.value( + creative_work, SDO.hasOfferCatalog + ) + + data_services = list(graph.objects(creative_work, SMA.offers)) + list( + graph.objects(creative_work, SDO.offers) + ) + + if offer_catalog: + data_services.extend( + list(graph.objects(offer_catalog, SMA.itemListElement)) + + list(graph.objects(offer_catalog, SDO.itemListElement)) + ) + schema_metadata["metadata_service"] = [] + for data_service in data_services: + if offer_catalog: + service_rdf_type = graph.value(data_service, RDF.type) + service_offer = data_service + else: + service_offer = graph.value(data_service, SMA.itemOffered) or graph.value( + data_service, SDO.itemOffered + ) + service_rdf_type = graph.value(service_offer, RDF.type) + + if "WebAPI" in str(service_rdf_type) or "Service" in str(service_rdf_type): + service_url = graph.value(service_offer, SMA.url) or graph.value(service_offer, SDO.url) + service_type = graph.value(service_offer, SMA.documentation) or graph.value( + service_offer, SDO.documentation + ) + schema_metadata["metadata_service"].append({"url": str(service_url), "type": str(service_type)}) return schema_metadata def get_dcat_metadata(self, graph): @@ -1031,14 +1122,22 @@ def get_dcat_metadata(self, graph): dcat_metadata = dict() DCAT = Namespace("http://www.w3.org/ns/dcat#") CSVW = Namespace("http://www.w3.org/ns/csvw#") - - datasets = list(graph[: RDF.type : DCAT.Dataset]) + 
dcat_root_type = "Dataset" + datasets = [] + cand_roots, object_types_dict = self.find_root_candidates(graph, ["Dataset", "Catalog"]) + print("CAND ROOTS DCAT: ", cand_roots, object_types_dict) + if cand_roots: + # prioritize Dataset type + if "Dataset" not in cand_roots: + dcat_root_type = next(iter(cand_roots)) + if dcat_root_type: + datasets = list(graph[: RDF.type : DCAT[dcat_root_type]]) table = list(graph[: RDF.type : CSVW.Column]) # print("TABLE", len(table)) if len(datasets) > 1: self.logger.info("FsF-F2-01M : Found more than one DCAT Dataset description, will use first one") if len(datasets) > 0: - dcat_metadata = self.get_metadata(graph, datasets[0], type="Dataset") + dcat_metadata = self.get_core_metadata(graph, datasets[0], type="Dataset") # distribution distribution = graph.objects(datasets[0], DCAT.distribution) # do something (check for table headers) with the table here.. @@ -1046,7 +1145,7 @@ def get_dcat_metadata(self, graph): print(t) dcat_metadata["object_content_identifier"] = [] for dist in distribution: - dtype, durl, dsize = None, None, None + dtype, durl, dsize, dservice = None, None, None, None if not ( graph.value(dist, DCAT.accessURL) or graph.value(dist, DCAT.downloadURL) @@ -1066,7 +1165,9 @@ def get_dcat_metadata(self, graph): extdist[0], DCAT.downloadURL ) dsize = distgraph.value(extdist[0], DCAT.byteSize) - dtype = distgraph.value(extdist[0], DCAT.mediaType) + dtype = distgraph.value(extdist[0], DCAT.mediaType) or distgraph.value( + extdist[0], DC.format + ) self.logger.info( "FsF-F2-01M : Found DCAT distribution URL info from remote location -:" + str(durl) ) @@ -1077,15 +1178,10 @@ def get_dcat_metadata(self, graph): # print(e) durl = str(dist) elif graph.value(dist, DCAT.accessService): - if not dcat_metadata["object_content_service"]: - dcat_metadata["object_content_service"] = [] for dcat_service in graph.objects(dist, DCAT.accessService): - service_url = graph.value(dcat_service, DCAT.endpointURL) - service_type = 
graph.value(dcat_service, DCTERMS.conformsTo) - servive_desc = graph.value(dcat_service, DCAT.endpointDescription) - dcat_metadata["object_content_service"].append( - {"url": service_url, "type": service_type, "desc": servive_desc} - ) + durl = graph.value(dcat_service, DCAT.endpointURL) + dtype = graph.value(dcat_service, DCTERMS.conformsTo) + dservice = graph.value(dcat_service, DCAT.endpointDescription) else: durl = graph.value(dist, DCAT.accessURL) or graph.value(dist, DCAT.downloadURL) # taking only one just to check if licence is available and not yet set @@ -1102,7 +1198,7 @@ def get_dcat_metadata(self, graph): if idutils.is_url(str(durl)): dtype = "/".join(str(dtype).split("/")[-2:]) dcat_metadata["object_content_identifier"].append( - {"url": str(durl), "type": dtype, "size": str(dsize)} + {"url": str(durl), "type": dtype, "size": str(dsize), "service": str(dservice)} ) if dcat_metadata["object_content_identifier"]: @@ -1110,15 +1206,14 @@ def get_dcat_metadata(self, graph): "FsF-F3-01M : Found data links in DCAT.org metadata -: " + str(dcat_metadata["object_content_identifier"]) ) - # TODO: add provenance metadata retrieval - # else: - # self.logger.info('FsF-F2-01M : Found DCAT content but could not correctly parse metadata') - # in order to keep DCAT in the found metadata list, we need to pass at least one metadata value.. - # dcat_metadata['object_type'] = 'Dataset' + # metadata services + data_services = graph.objects(datasets[0], DCAT.service) + dcat_metadata["metadata_service"] = [] + for data_service in data_services: + service_url = graph.value(data_service, DCAT.endpointURL) + service_type = graph.value(data_service, DCTERMS.conformsTo) + dcat_metadata["metadata_service"].append({"url": str(service_url), "type": str(service_type)}) return dcat_metadata - # rdf_meta.query(self.metadata_mapping.value) - # print(rdf_meta) - # return None def get_content_type(self): """Get the content type. 
diff --git a/fuji_server/helper/metadata_collector_xml.py b/fuji_server/helper/metadata_collector_xml.py index bcbf850d..5aeba72a 100644 --- a/fuji_server/helper/metadata_collector_xml.py +++ b/fuji_server/helper/metadata_collector_xml.py @@ -250,6 +250,49 @@ def parse_metadata(self): self.logger.info("FsF-F2-01M : Could not identify metadata properties in XML") return source_name, xml_metadata + def get_tree_property_list(self, propcontent): + res = [] + if isinstance(propcontent, list): + if len(propcontent) == 1: + if propcontent[0].get("attribute"): + res = propcontent[0].get("tree").attrib.get(propcontent[0].get("attribute")) + elif len(propcontent[0].get("tree")) == 0: + res = propcontent[0].get("tree").text + else: + res = lxml.etree.tostring(propcontent[0].get("tree"), method="text", encoding="unicode") + res = re.sub(r"\s+", " ", res) + res = res.strip() + res = [res] + else: + for propelem in propcontent: + if propelem.get("attribute"): + res.append(propelem.get("tree").attrib.get(propelem.get("attribute"))) + elif len(propelem.get("tree")) == 0: + res.append(propelem.get("tree").text) + else: + resprop = lxml.etree.tostring(propelem.get("tree"), method="text", encoding="unicode") + resprop = re.sub(r"\s+", " ", resprop) + resprop = resprop.strip() + res.append(resprop) + return res + + def path_query(self, mappath, tree): + pathdef = mappath.split("@@") + attribute = None + if len(pathdef) > 1: + attribute = pathdef[1] + if ":" in attribute: + if attribute.split(":")[0] == "xlink": + attribute = "{http://www.w3.org/1999/xlink}" + attribute.split(":")[1] + elif attribute.split(":")[0] == "xml": + attribute = "{http://www.w3.org/XML/1998/namespace}" + attribute.split(":")[1] + try: + subtrees = tree.findall(pathdef[0]) + except Exception as e: + subtrees = [] + print("XML XPATH error ", str(e), str(pathdef[0])) + return subtrees, attribute + def get_mapped_xml_metadata(self, tree, mapping): """Get the mapped XML metadata. 
@@ -278,45 +321,37 @@ def get_mapped_xml_metadata(self, tree, mapping): pathlist = [mapping.get(prop).get("path")] propcontent = [] + path_no = 0 for mappath in pathlist: - pathdef = mappath.split("@@") - attribute = None - if len(pathdef) > 1: - attribute = pathdef[1] - if ":" in attribute: - if attribute.split(":")[0] == "xlink": - attribute = "{http://www.w3.org/1999/xlink}" + attribute.split(":")[1] - elif attribute.split(":")[0] == "xml": - attribute = "{http://www.w3.org/XML/1998/namespace}" + attribute.split(":")[1] - try: - subtrees = tree.findall(pathdef[0]) - except Exception as e: - subtrees = [] - print("XML XPATH error ", str(e), str(pathdef[0])) + subtrees, attribute = self.path_query(mappath, tree) for subtree in subtrees: - propcontent.append({"tree": subtree, "attribute": attribute}) - # propcontent.extend({'tree':tree.findall(pathdef[0]),'attribute':attribute}) - if isinstance(propcontent, list): - if len(propcontent) == 1: - if propcontent[0].get("attribute"): - res[prop] = propcontent[0].get("tree").attrib.get(propcontent[0].get("attribute")) - elif len(propcontent[0].get("tree")) == 0: - res[prop] = propcontent[0].get("tree").text - else: - res[prop] = lxml.etree.tostring(propcontent[0].get("tree"), method="text", encoding="unicode") - res[prop] = re.sub(r"\s+", " ", res[prop]) - res[prop] = res[prop].strip() - else: - for propelem in propcontent: - if propelem.get("attribute"): - res[prop].append(propelem.get("tree").attrib.get(propelem.get("attribute"))) - elif len(propelem.get("tree")) == 0: - res[prop].append(propelem.get("tree").text) + if mapping.get(prop).get("subpath"): + subpathdict = mapping.get(prop).get("subpath") + if isinstance(subpathdict, list): + if len(subpathdict) > path_no: + subpathdict = subpathdict[path_no] + else: + subpathdict = subpathdict[0] else: - resprop = lxml.etree.tostring(propelem.get("tree"), method="text", encoding="unicode") - resprop = re.sub(r"\s+", " ", resprop) - resprop = resprop.strip() - 
res[prop].append(resprop) + subpathdict = subpathdict + for subprop, subpath in subpathdict.items(): + if not res.get(prop + "_" + subprop): + res[prop + "_" + subprop] = [] + subsubtrees, subattribute = self.path_query(subpath, subtree) + if not subsubtrees: + subsubtrees = [lxml.etree.Element("none")] + subattribute = None + # print(prop+'_'+subprop,subsubtrees[0], ' -#- ',lxml.etree.tostring(subsubtrees[0], method="text", encoding="unicode"),' -#- ', subattribute) + subpropcontent = [{"tree": subsubtrees[0], "attribute": subattribute}] + if subpropcontent: + # print('SUBPROP: ',subprop, self.get_tree_property_list(subpropcontent)) + res[prop + "_" + subprop].extend(self.get_tree_property_list(subpropcontent)) + # print(res) + else: + propcontent.append({"tree": subtree, "attribute": attribute}) + if propcontent: + res[prop] = self.get_tree_property_list(propcontent) + path_no += 1 # related resources for kres, vres in res.items(): @@ -338,70 +373,6 @@ def get_mapped_xml_metadata(self, tree, mapping): res["related_resources"].append({"related_resource": relres, "resource_type": reltype}) ri += 1 # object_content_identifiers - """ - # The code below would theoretically also consider information which does not include a content identifier but only sie or type of content - res['object_content_identifier'] = [] - if res.get('object_content_identifier_url'): - #if not isinstance(res.get('object_content_identifier_url'), list): - # res['object_content_identifier_url'] = [res.get('object_content_identifier_url')] - if not isinstance(res.get('object_content_identifier_size'), list): - res['object_content_identifier_size'] = [res.get('object_content_identifier_size')] - if not isinstance(res.get('object_content_identifier_type'), list): - res['object_content_identifier_type'] = [res.get('object_content_identifier_type')] - - object_content_count = max(len(res.get('object_content_identifier_url') or []), - len(res.get('object_content_identifier_type') or []), - 
len(res.get('object_content_identifier_size') or [])) - - for content_index in range(object_content_count): - try: - content_url = res['object_content_identifier_url'][content_index] - except: - content_url = None - try: - content_size = res['object_content_identifier_size'][content_index] - except: - content_size = None - try: - content_type = res['object_content_identifier_type'][content_index] - except: - content_type = None - res['object_content_identifier'].append({ - 'url': content_url, - 'size': content_size, - 'type': content_type - }) - res.pop('object_content_identifier_type', None) - res.pop('object_content_identifier_size', None) - res.pop('object_content_identifier_url', None) - """ - if res.get("object_content_service_url"): - res["object_content_service"] = [] - if not isinstance(res["object_content_service_url"], list): - res["object_content_service_url"] = [res["object_content_service_url"]] - si = 0 - for service_url in res["object_content_identifier_url"]: - service_desc = None - service_type = None - if res.get("object_content_service_type"): - if si < len(res["object_content_service_type"]): - service_type = res["object_content_service_type"][si] - if res.get("object_content_service_desc"): - if si < len(res["object_content_service_desc"]): - service_desc = res["object_content_service_desc"][si] - if ( - service_type - and "WWW:LINK" not in str(service_type) - and "www.w3.org/TR/xlink" not in str(service_type) - ): - res["object_content_service"].append( - {"url": service_url, "desc": service_desc, "type": service_type} - ) - si += 1 - res.pop("object_content_service_url", None) - res.pop("object_content_service_type", None) - res.pop("object_content_service_desc", None) - if res.get("object_content_identifier_url"): res["object_content_identifier"] = [] if not isinstance(res["object_content_identifier_url"], list): @@ -410,17 +381,25 @@ def get_mapped_xml_metadata(self, tree, mapping): for content_url in res["object_content_identifier_url"]: 
content_size = None content_type = None + content_service = None if res.get("object_content_identifier_size"): if ci < len(res["object_content_identifier_size"]): content_size = res["object_content_identifier_size"][ci] if res.get("object_content_identifier_type"): if ci < len(res["object_content_identifier_type"]): content_type = res["object_content_identifier_type"][ci] + if res.get("object_content_identifier_service"): + if ci < len(res["object_content_identifier_service"]): + if "WWW:LINK" not in str( + res["object_content_identifier_service"][ci] + ) and "www.w3.org/TR/xlink" not in str(res["object_content_identifier_service"][ci]): + content_service = res["object_content_identifier_service"][ci] res["object_content_identifier"].append( - {"url": content_url, "size": content_size, "type": content_type} + {"url": content_url, "size": content_size, "type": content_type, "service": content_service} ) ci += 1 res.pop("object_content_identifier_type", None) res.pop("object_content_identifier_size", None) res.pop("object_content_identifier_url", None) + res.pop("object_content_identifier_service", None) return res diff --git a/fuji_server/helper/metadata_mapper.py b/fuji_server/helper/metadata_mapper.py index c51cae89..c22be5d5 100644 --- a/fuji_server/helper/metadata_mapper.py +++ b/fuji_server/helper/metadata_mapper.py @@ -61,6 +61,7 @@ def flip_dict(dict_to_flip): "object_size": {"label": "Object Size", "sameAs": "http://purl.org/dc/terms/extent"}, "language": {"label": "Language", "sameAs": "http://purl.org/dc/terms/language"}, "license_path": {"label": "License Path", "sameAs": None}, + "metadata_service": {"label": "Metadata Service", "sameAs": None}, } # core metadata elements (FsF-F2-01M) @@ -192,6 +193,7 @@ def flip_dict(dict_to_flip): # Schema.org # conditionsOfAccess, usageInfo?, isAccessibleForFree ## A license document that applies to this content, typically indicated by URL. 
+ ## actually this mapping is now deprecated and replaced by RDF collector SCHEMAORG_MAPPING = ( '{title: name[*]."@value" || name || headline[*]."@value" || headline, object_type: "@type", ' 'publication_date: datePublished."@value" || datePublished || dateCreated, ' @@ -203,7 +205,7 @@ def flip_dict(dict_to_flip): "right_holder: copyrightHolder[*].name || copyrightHolder[*].familyName, " "publisher: [publisher.url || provider.url, publisher.name || provider.name || publisher || provider], " 'license: license."@id" || license[?"@type" ==\'CreativeWork\'].id || license[?"@type" ==\'CreativeWork\'].url || license[?"@type" ==\'CreativeWork\'].name || license, ' - "summary: description, keywords: keywords, " + "summary: description || abstract, keywords: keywords, " 'object_identifier: [((identifier.value || identifier[*].value || identifier || "@id") || (url || url."@id")) , ' '(sameAs."@id" || sameAs[0]."@id" || sameAs.url || sameAs[0].url || sameAs)][], ' "access_level: conditionsOfAccess, " @@ -216,8 +218,9 @@ def flip_dict(dict_to_flip): '{related_resource: (isBasedOn."@id" || isBasedOn[0]."@id" || isBasedOn.url || isBasedOn[0].url || isBasedOn) , relation_type: \'isBasedOn\'} , ' '{related_resource: "@reverse".isBasedOn[0]."@id" || "@reverse".isBasedOn."@id" || "@reverse".isBasedOn[0].url || isBasedOn , relation_type: \'isBasisFor\'},' '{related_resource: (citation."@id" || citation[0]."@id" || citation.url || citation[0].url || citation.name || citation[0].name || citation), relation_type:\'references\'} ], ' - "object_content_identifier: (distribution[*].{url: (contentUrl || url), type: (encodingFormat || fileFormat), size: (contentSize || fileSize), profile: schemaVersion} || [distribution.{url: (contentUrl || url), type: (encodingFormat || fileFormat), size: (contentSize || fileSize), profile: schemaVersion}])," - "object_content_service: (potentialAction[*].{url: (target || url), type: target.additionalType, desc: target.urlTemplate} )," + 
"object_content_identifier: (distribution[*].{url: (contentUrl || url), type: (encodingFormat || fileFormat), size: (contentSize || fileSize), profile: schemaVersion} || " + "[distribution.{url: (contentUrl || url), type: (encodingFormat || fileFormat), size: (contentSize || fileSize), profile: schemaVersion}] || " + "potentialAction[*].{url: (target || url), type: target.additionalType, service: target.urlTemplate})," "language: inLanguage.name || inLanguage.alternateName || inLanguage}" ) # 'related_resources: [{related_resource: isPartOf, relation_type: \'isPartOf\'}, {related_resource: isBasedOn, relation_type: \'isBasedOn\'}], ' \ @@ -312,6 +315,7 @@ def flip_dict(dict_to_flip): ################# XML Mappings ############### # relations: indicate type using: related_resource_[opional relation type] alternative: define a list 'related_resource_type' # content identifiers: object_content_identifier_url, object_content_identifier_size, object_content_identifier_type (should have same length) + # otherwise take a look at the ISO/GCMD mapping # attributes: must be indicated like this: tag@@attribute XML_MAPPING_DUBLIN_CORE = { @@ -634,26 +638,43 @@ def flip_dict(dict_to_flip): "./{*}identificationInfo//{*}spatialRepresentationType/{*}MD_SpatialRepresentationTypeCode", ] }, + "object_content_identifier": { + "path": [ + "./{*}distributionInfo/{*}MD_Distribution//{*}CI_OnlineResource", + "./{*}distributionInfo/{*}MD_Distribution/{*}transferOptions/{*}MD_DigitalTransferOptions/{*}onLine/{*}CI_OnlineResource", + ], + "subpath": [ + { + "url": "{*}linkage/{*}URL", + # https: // wiki.esipfed.org / Documenting_Online_Resources in INSPIRE compatible records this looks different and a controlled vocab is used + "type": "{*}applicationProfile/{*}Anchor@@xlink:href", + "service": "{*}protocol/{*}Anchor@@xlink:href", + }, + { + "url": "{*}linkage/{*}URL", + "type": "{*}applicationProfile/{*}Anchor@@xlink:href", + "service": "{*}protocol/{*}Anchor@@xlink:href", + }, + ], + }, + 
""" "object_content_identifier_url": { "path": [ - "./{*}distributionInfo/{*}MD_Distribution/{*}transferOptions/{*}MD_DigitalTransferOptions/{*}onLine/{*}CI_OnlineResource/{*}linkage/{*}URL", "./{*}distributionInfo/{*}MD_Distribution//{*}CI_OnlineResource/{*}linkage/{*}URL", + #"./{*}distributionInfo/{*}MD_Distribution//{*}CI_OnlineResource[{*}protocol]/{*}linkage/{*}URL", + "./{*}distributionInfo/{*}MD_Distribution/{*}transferOptions/{*}MD_DigitalTransferOptions/{*}onLine/{*}CI_OnlineResource/{*}linkage/{*}URL" ] }, "object_content_identifier_type": { "path": [ "./{*}distributionInfo/{*}MD_Distribution//{*}CI_OnlineResource/{*}applicationProfile/{*}Anchor", - "./{*}distributionInfo/{*}MD_Distribution/{*}transferOptions/{*}MD_DigitalTransferOptions/{*}onLine/{*}CI_OnlineResource/{*}applicationProfile/{*}Anchor", - ] - }, - "object_content_service_url": { - "path": [ - "./{*}distributionInfo/{*}MD_Distribution//{*}CI_OnlineResource[{*}protocol]/{*}linkage/{*}URL", + "./{*}distributionInfo/{*}MD_Distribution/{*}transferOptions/{*}MD_DigitalTransferOptions/{*}onLine/{*}CI_OnlineResource/{*}applicationProfile/{*}Anchor" ] }, - "object_content_service_type": { + "object_content_identifier_service": { "path": "./{*}distributionInfo/{*}MD_Distribution//{*}CI_OnlineResource/{*}protocol/{*}Anchor@@xlink:href" }, + """ "measured_variable": { "path": [ "./{*}contentInfo/{*}MD_CoverageDescription/{*}attributeDescription/{*}RecordType", diff --git a/fuji_server/helper/preprocessor.py b/fuji_server/helper/preprocessor.py index 53e476d4..f51e9f79 100644 --- a/fuji_server/helper/preprocessor.py +++ b/fuji_server/helper/preprocessor.py @@ -11,7 +11,7 @@ import requests import yaml -from fuji_server.helper.linked_vocab_helper import linked_vocab_helper +from fuji_server.helper.linked_vocab_helper import LinkedVocabHelper class Preprocessor: @@ -412,7 +412,7 @@ def get_linked_vocab_index(cls): @classmethod def retrieve_linked_vocab_index(cls): - lov_helper = linked_vocab_helper() + 
lov_helper = LinkedVocabHelper() lov_helper.set_linked_vocab_index() cls.linked_vocab_index = lov_helper.linked_vocab_index diff --git a/fuji_server/helper/request_helper.py b/fuji_server/helper/request_helper.py index f67ee4b2..ec1b1045 100644 --- a/fuji_server/helper/request_helper.py +++ b/fuji_server/helper/request_helper.py @@ -62,6 +62,8 @@ def list(): class RequestHelper: def __init__(self, url, logInst: object = None): + self.user_agent = "F-UJI" + self.browser_like_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; F-UJI)" self.checked_content = {} if logInst: self.logger = logInst @@ -145,7 +147,7 @@ def content_negotiate(self, metric_id="", ignore_html=True): redirect_handler, ) urllib.request.install_opener(opener) - request_headers = {"Accept": self.accept_type, "User-Agent": "F-UJI"} + request_headers = {"Accept": self.accept_type, "User-Agent": self.user_agent} if self.authtoken: request_headers["Authorization"] = self.tokentype + " " + self.authtoken tp_request = urllib.request.Request(self.request_url, headers=request_headers) @@ -167,6 +169,17 @@ def content_negotiate(self, metric_id="", ignore_html=True): "%s : F-UJI 308 redirect failed, most likely this patch: https://github.com/python/cpython/pull/19588/commits is not installed" % metric_id ) + elif e.code == 405: + self.logger.error( + "%s : Received a 405 HTTP error, most likely because the host denied the User-Agent (web scraping detection), retrying..." 
+ % metric_id + ) + try: + request_headers["User-Agent"] = self.browser_like_user_agent + tp_request = urllib.request.Request(self.request_url, headers=request_headers) + tp_response = opener.open(tp_request, timeout=10) + except: + print("405 fix error:" + str(e)) elif e.code >= 500: if "doi.org" in self.request_url: self.logger.error( diff --git a/fuji_server/models/fair_results.py b/fuji_server/models/fair_results.py index a7744473..18105ee8 100644 --- a/fuji_server/models/fair_results.py +++ b/fuji_server/models/fair_results.py @@ -9,6 +9,7 @@ from fuji_server import util from fuji_server.models.any_of_fair_results_results_items import AnyOfFAIRResultsResultsItems from fuji_server.models.base_model_ import Model +from fuji_server.models.harvest_results_metadata import HarvestResultsMetadata class FAIRResults(Model): @@ -31,6 +32,7 @@ def __init__( total_metrics: int | None = None, summary: dict | None = None, results: list[AnyOfFAIRResultsResultsItems] | None = None, + harvested_metadata: list[HarvestResultsMetadata] | None = None, ): """FAIRResults - a model defined in Swagger @@ -58,6 +60,8 @@ def __init__( :type summary: Dict :param results: The results of this FAIRResults. # noqa: E501 :type results: List[AnyOfFAIRResultsResultsItems] + :param harvested_metadata: The harvested_metadata of this FAIRResults. 
# noqa: E501 + :type harvested_metadata: List[HarvestResultsMetadata] """ self.swagger_types = { "test_id": str, @@ -72,6 +76,7 @@ def __init__( "total_metrics": int, "summary": dict, "results": list[AnyOfFAIRResultsResultsItems], + "harvested_metadata": list[HarvestResultsMetadata], } self.attribute_map = { @@ -87,6 +92,7 @@ def __init__( "total_metrics": "total_metrics", "summary": "summary", "results": "results", + "harvested_metadata": "harvested_metadata", } self._test_id = test_id self._request = request @@ -100,6 +106,7 @@ def __init__( self._total_metrics = total_metrics self._summary = summary self._results = results + self._harvested_metadata = harvested_metadata @classmethod def from_dict(cls, dikt) -> "FAIRResults": @@ -363,3 +370,24 @@ def results(self, results: list[AnyOfFAIRResultsResultsItems]): """ self._results = results + + @property + def harvested_metadata(self) -> list[HarvestResultsMetadata]: + """Gets the harvested_metadata of this FAIRResults. + + + :return: The harvested_metadata of this FAIRResults. + :rtype: List[HarvestResultsMetadata] + """ + return self._harvested_metadata + + @harvested_metadata.setter + def harvested_metadata(self, harvested_metadata: list[HarvestResultsMetadata]): + """Sets the harvested_metadata of this FAIRResults. + + + :param harvested_metadata: The harvested_metadata of this FAIRResults. 
+ :type harvested_metadata: List[HarvestResultsMetadata] + """ + + self._harvested_metadata = harvested_metadata diff --git a/fuji_server/yaml/metrics_v0.5.yaml b/fuji_server/yaml/metrics_v0.5.yaml index d37d87ed..d14751cb 100644 --- a/fuji_server/yaml/metrics_v0.5.yaml +++ b/fuji_server/yaml/metrics_v0.5.yaml @@ -67,7 +67,7 @@ metrics: - target: https://f-uji.net/vocab/identifier/persistent tested_on: https://f-uji.net/vocab/metadata/property/object_identifier comment: identifier has to resolve to a valid URI - metric_test_score: 1 + metric_test_score: 0.5 metric_test_maturity: 3 created_by: FAIRsFAIR date_created: 2020-07-08 @@ -82,12 +82,7 @@ metrics: description: Metadata is descriptive information about a data object. Since the metadata required differs depending on the users and their applications, this metric focuses on core metadata. The core metadata is the minimum descriptive information required to enable data finding, including citation which makes it easier to find data. We determine the required metadata based on common data citation guidelines (e.g., DataCite, ESIP, and IASSIST), and metadata recommendations for data discovery (e.g., EOSC Datasets Minimum Information (EDMI), DataCite Metadata Schema, W3C Recommendation Data on the Web Best Practices and Data Catalog Vocabulary). This metric focuses on domain-agnostic core metadata. Domain or discipline-specific metadata specifications are covered under metric FsF-R1.3-01M. A repository should adopt a schema that includes properties of core metadata, whereas data authors should take the responsibility of providing core metadata. fair_principle: F2 target: Metadata - evaluation_mechanism: >- - Metadata can be offered in different ways. here we focus on common web based strategies. - These include 1) embedding metadata within the landing page such as JSON-LD, OpenGraph, Microdata, Dublin Core, - 2) offering typed links which lead to metadata within the HTML code of the metadata or signposting links. 
- 3) enable content negotiation and deliver e.g. RDF, JSON-LD or XML on demand. - The metric evaluates the completeness of metadata in case metadata has been retrieved. + evaluation_mechanism: Metadata can be offered in different ways. here we focus on common web based strategies. These include 1) embedding metadata within the landing page such as JSON-LD, OpenGraph, Microdata, Dublin Core, 2) offering typed links which lead to metadata within the HTML code of the metadata or signposting links. 3) enable content negotiation and deliver e.g. RDF, JSON-LD or XML on demand. The metric evaluates the completeness of metadata in case metadata has been retrieved. test_scoring_mechanism: cumulative metric_tests: - metric_test_identifier: FsF-F2-01M-1 @@ -154,10 +149,11 @@ metrics: metric_test_requirements: - target: https://f-uji.net/vocab/data/property tested_on: https://f-uji.net/vocab/metadata/property/object_content_identifier - modality: all + modality: any required: - type - size + - name - metric_test_identifier: FsF-F3-01M-2 metric_test_name: Metadata contains a PID or URL which indicates the location of the downloadable data content metric_test_score: 0.5 @@ -200,7 +196,7 @@ metrics: name: - rdfa - microdata - - meta_tags + - meta_tag - json_in_html - metric_test_identifier: FsF-F4-01M-2 metric_test_name: Metadata is registered in major research data registries (DataCite) @@ -293,24 +289,24 @@ metrics: version: 0.5 total_score: 1 -- metric_identifier: FsF-A2-01M - metric_number: 9 - metric_short_name: Metadata Preservation - metric_name: Metadata remains available, even if the data is no longer available. - description: This metric determines if the metadata will be preserved even when the data they represent are no longer available, replaced or lost. - fair_principle: A2 - target: Metadata - evaluation_mechanism: Currently this metric can only be assessed using the persistent identifier as an indicator. DOI metadata is preserved by DataCite. 
- metric_tests: - - metric_test_identifier: FsF-A2-01M-1 - metric_test_name: The persistent identifier system used guarantees the preservation of associated metadata - metric_test_score: 1 - metric_test_maturity: 3 - created_by: FAIRsFAIR - date_created: 2020-07-08 - date_updated: 2020-12-05 - version: 0.5 - total_score: 1 +#- metric_identifier: FsF-A2-01M +# metric_number: 9 +# metric_short_name: Metadata Preservation +# metric_name: Metadata remains available, even if the data is no longer available. +# description: This metric determines if the metadata will be preserved even when the data they represent are no longer available, replaced or lost. +# fair_principle: A2 +# target: Metadata +# evaluation_mechanism: Currently this metric can only be assessed using the persistent identifier as an indicator. DOI metadata is preserved by DataCite. +# metric_tests: +# - metric_test_identifier: FsF-A2-01M-1 +# metric_test_name: The persistent identifier system used guarantees the preservation of associated metadata +# metric_test_score: 1 +# metric_test_maturity: 3 +# created_by: FAIRsFAIR +# date_created: 2020-07-08 +# date_updated: 2020-12-05 +# version: 0.5 +# total_score: 1 - metric_identifier: FsF-I1-01M metric_number: 10 @@ -338,7 +334,7 @@ metrics: modality: any required: name: - - meta_tags + - meta_tag - microdata - rdfa - json_in_html @@ -459,8 +455,11 @@ metrics: - metric_test_identifier: FsF-R1-01MD-2b metric_test_name: Measured variables or observation types are specified in metadata metric_test_score: 0 + - metric_test_identifier: FsF-R1-01MD-2c + metric_test_name: Data service endpoint and protocol information are specified in metadata + metric_test_score: 0 - metric_test_identifier: FsF-R1-01MD-3 - metric_test_name: Data content matches file type and size specified in metadata + metric_test_name: Data content matches file type and size or protocol specified in metadata metric_test_score: 1 metric_test_maturity: 3 - metric_test_identifier: FsF-R1-01MD-4 
diff --git a/fuji_server/yaml/metrics_v0.5ss.yaml b/fuji_server/yaml/metrics_v0.5ss.yaml new file mode 100644 index 00000000..013e12d6 --- /dev/null +++ b/fuji_server/yaml/metrics_v0.5ss.yaml @@ -0,0 +1,179 @@ +# LIST OF FARISFAIR METRICS AND THEIR RESPONSE OUTPUT FORMATS +config: + metric_specification: https://doi.org/10.5281/zenodo.4081213 + metric_status: valid +metrics: + ## ---------------- FINDABILITY ---------------- ## +- metric_identifier: FsF-F2-01M-ss + metric_number: 3 + metric_short_name: Descriptive Core Metadata + metric_name: Metadata includes descriptive core elements (title, identifier, publisher, abstract and language) relevant for the social sciences to support data findability. + description: Metadata is descriptive information about a data object. Since the metadata required differs depending on the users and their applications, this metric focuses on core metadata. The social science community has defined specific requirements for core metadata and the individual content to be described with it defined in the CESSDA Metadata Model (CMM). These are community specific with respect to certain properties but coincide to a large extent with domain agnostic specifications such as common data citation guidelines (e.g., DataCite, ESIP, and IASSIST), and metadata recommendations for data discovery (e.g., EOSC Datasets Minimum Information (EDMI), DataCite Metadata Schema, W3C Recommendation Data on the Web Best Practices and Data Catalog Vocabulary). Core descriptive metadata for social sciences data are title, identifier, publisher, abstract and language. + fair_principle: F2 + target: Metadata + evaluation_mechanism: Use the data identifier to access its metadata document. Parse or retrieve core metadata, e.g., through one or more options below, combine the results and then verify presence/absence of the core elements in the metadata. 
Structured data embedded in the landing page of the identifier (e.g., Schema.org, Dublin Core meta tags or RDFa metadata) Typed Links in the HTTP Link header leading to DDI or compatible metadata; for more information, see https://signposting.org/conventions/ Content negotiation (including external negotiation services offered by PID providers) to retrieve DDI metadata or a compatible standard. + test_scoring_mechanism: cumulative + metric_tests: + - metric_test_identifier: FsF-F2-01M-1-ss + metric_test_name: Metadata has been made available via common web methods + metric_test_score: 1 + metric_test_maturity: 1 + - metric_test_identifier: FsF-F2-01M-3-ss + metric_test_name: Core descriptive metadata is available + metric_test_score: 1 + metric_test_maturity: 3 + metric_test_requirements: + - modality: all + target: https://f-uji.net/vocab/metadata/property + required: + name: + - title + - object_identifier + - publisher + - abstract + - language + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2022-05-30 + version: 0.5 + total_score: 2 + +- metric_identifier: FsF-F4-01M-ss + metric_number: 5 + metric_short_name: Searchable Metadata + metric_name: Metadata is offered in such a way that it can be retrieved programmatically. + description: This metric refers to ways through which the metadata of data is exposed or provided in a standard and machine-readable format. Assessing this metric will require an understanding of the capabilities offered by the data repository used to host the data. Metadata may be available through multiple endpoints. For example, if data is hosted by a repository, the repository may disseminate its metadata through a metadata harvesting protocol (e.g., via OAI-PMH) and/or a web service. Metadata may also be embedded as structured data on a data page for use by web search engines such as Google and Bing or be available as linked (open) data. 
+ fair_principle: F4 + target: Metadata + evaluation_mechanism: The metric is evaluated using the given metadata standards known to support major search engines such as JSON-LD and Dublin Core. Presence of metadata in research data registries is further evaluated. + test_scoring_mechanism: cumulative + metric_tests: + - metric_test_identifier: FsF-F4-01M-3-ss + metric_test_name: Metadata is offered via metadata exchange standard interface (OAI-PMH) + metric_test_score: 1 + metric_test_maturity: 3 + metric_test_requirements: + - modality: any + target: https://f-uji.net/vocab/metadata/exchange_service + required: + name: + - OAI-PMH + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2022-05-30 + version: 0.5 + total_score: 1 + +- metric_identifier: FsF-I2-01M-ss + metric_number: 11 + metric_short_name: Metadata with Semantic Resources + metric_name: Metadata uses semantic resources + description: A metadata document or selected parts of the document may incorporate additional terms from semantic resources (also referred as semantic artefacts) so that the contents are unambiguous and can be processed automatically by machines. This enrichment facilitates enhanced data search and interoperability of data from different sources. Ontology, thesaurus, and taxonomy are kinds of semantic resources, and they come with varying degrees of expressiveness and computational complexity. Knowledge organization schemes such as thesaurus and taxonomy are semantically less formal than ontologies. + fair_principle: I2 + target: Metadata + evaluation_mechanism: Used namespaces are identified in given graph or XML metadata and verified using a controlled list. 
+ test_scoring_mechanism: cumulative + metric_tests: + - metric_test_identifier: FsF-I2-01M-2-ss + metric_test_name: Namespaces of known semantic resources can be identified in metadata + metric_test_score: 1 + metric_test_maturity: 3 + metric_test_requirements: + - target_property: uri_format + target: https://f-uji.net/vocab/metadata/semantic_resource + modality: any + match: wildcard + required: + identifier: + - https://vocabularies.cessda.eu/* + - http://rdf-vocabulary.ddialliance.org/* + - https://www.gesis.org/vocabulary/* + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2020-12-03 + version: 0.5 + total_score: 1 + +- metric_identifier: FsF-R1.1-01M-ss + metric_number: 14 + metric_short_name: Data Usage License + metric_name: Metadata includes license information under which data can be reused. + description: This metric evaluates if data is associated with a license because otherwise users cannot reuse it in a clear legal context. We encourage the application of licenses for all kinds of data whether public, restricted or for specific users. Without an explicit license, users do not have a clear idea of what can be done with your data. Licenses can be of standard type (Creative Commons, Open Data Commons Open Database License) or bespoke licenses, and rights statements which indicate the conditions under which data can be reused. It is highly recommended to use a standard, machine-readable license such that it can be interpreted by machines and humans. In order to inform users about what rights they have to use a dataset, the license information should be specified as part of the dataset’s metadata. + fair_principle: R1.1 + target: Metadata + evaluation_mechanism: Metric evaluation is based on the presence of a machine readable license information in an appropriate metadata element/field. 
+ test_scoring_mechanism: cumulative + metric_tests: + - metric_test_identifier: FsF-R1.1-01M-1-ss + metric_test_name: Licence information is given in an appropriate metadata element + metric_test_score: 1 + metric_test_maturity: 1 + - metric_test_identifier: FsF-R1.1-01M-2-ss + metric_test_name: Recognized licence is valid (community specific or registered at SPDX) + metric_test_score: 1 + metric_test_maturity: 3 + metric_test_requirements: + - modality: any + target: https://f-uji.net/vocab/licenses + match: wildcard + required: + name: + - CC-BY* + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2023-06-02 + version: 0.5 + total_score: 2 + +- metric_identifier: FsF-R1.3-01M-ss + metric_number: 16 + metric_short_name: Community-Endorsed Metadata Standard + metric_name: Metadata follows a standard recommended by the target research community of the data. + description: In addition to core metadata required to support data discovery (covered under metric FsF-F2-01M), metadata to support data reusability should be made available following community-endorsed metadata standards. Some communities have well-established metadata standards (e.g., geospatial [ISO19115], biodiversity [DarwinCore, ABCD, EML], social science [DDI], astronomy [International Virtual Observatory Alliance Technical Specifications]) while others have limited standards or standards that are under development (e.g., engineering and linguistics). The use of community-endorsed metadata standards is usually encouraged and supported by domain and discipline-specific repositories. 
+ fair_principle: R1.3 + target: Metadata + evaluation_mechanism: Metadata encodings can be verified using community specific namespaces and schemas listed by the RDA metadata standards WG or fairsharing.org + test_scoring_mechanism: alternative + metric_tests: + - metric_test_identifier: FsF-R1.3-01M-1-ss + metric_test_name: Community specific metadata standard is detected using namespaces or schemas found in provided metadata or metadata services outputs + metric_test_score: 1 + metric_test_maturity: 3 + metric_test_requirements: + - modality: any + target: https://f-uji.net/vocab/metadata/standards + match: full + required: + name: + - ddi-data-documentation-initiative + - dara-metadata-schema + - metric_test_identifier: FsF-R1.3-01M-3-ss + metric_test_name: Multidisciplinary but community endorsed metadata standard is listed in the re3data record or detected by namespace + metric_test_score: 1 + metric_test_maturity: 1 + metric_test_requirements: + - modality: any + target: https://f-uji.net/vocab/metadata/standards + match: full + required: + name: + - datacite-metadata-schema + - dcat-data-catalog-vocabulary + - schemaorg + - dublin-core + - metric_test_identifier: FsF-R1.3-01M-2-ss + metric_test_name: Community specific metadata standard is listed in the re3data record of the responsible repository + metric_test_score: 1 + metric_test_maturity: 2 + metric_test_requirements: + - modality: any + target: https://f-uji.net/vocab/metadata/standards + match: full + required: + - ddi-data-documentation-initiative + - dara-metadata-schema + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2020-12-03 + version: 0.5 + total_score: 1 diff --git a/fuji_server/yaml/metrics_v0.5ssv2.yaml b/fuji_server/yaml/metrics_v0.5ssv2.yaml new file mode 100644 index 00000000..1022a7f3 --- /dev/null +++ b/fuji_server/yaml/metrics_v0.5ssv2.yaml @@ -0,0 +1,584 @@ +# LIST OF FAIRSFAIR METRICS AND THEIR RESPONSE OUTPUT FORMATS +config: + metric_specification: 
https://doi.org/10.5281/zenodo.4081213 + metric_status: valid +metrics: + +- metric_identifier: FsF-F1-01D + metric_number: 1 + metric_short_name: Unique Identifier + metric_name: Data is assigned a globally unique identifier. + description: A data object may be assigned with a globally unique identifier such that it can be referenced unambiguously by humans or machines. Globally unique means an identifier should be associated with only one resource at any time. Examples of unique identifiers of data are Internationalized Resource Identifier (IRI), Uniform Resource Identifier (URI) such as URL and URN, Digital Object Identifier (DOI), the Handle System, identifiers.org, w3id.org and Archival Resource Key (ARK). A data repository may assign a globally unique identifier to your data or metadata when you publish and make it available through their services. + fair_principle: F1 + target: Data + evaluation_mechanism: Identifier is considered unique if it is successfully validated through https://pythonhosted.org/IDUtils/. Supported schemes are ISBN10, ISBN13, ISSN, ISTC, DOI, Handle, EAN8, EAN13, ISNI ORCID, ARK, PURL, LSID, URN, Bibcode, arXiv, PubMed ID, PubMed Central ID, GND. 
+ test_scoring_mechanism: alternative + metric_tests: + - metric_test_identifier: FsF-F1-01D-1 + metric_test_name: Identifier is resolvable and follows a defined unique identifier syntax (IRI, URL) + metric_test_score: 1 + metric_test_maturity: 3 + metric_test_requirements: + - target: https://f-uji.net/vocab/identifier + tested_on: https://f-uji.net/vocab/metadata/property/object_identifier + modality: any + comment: identifier can be given as user input + - metric_test_identifier: FsF-F1-01D-2 + metric_test_name: Identifier is not resolvable but follows an UUID or HASH type syntax + metric_test_score: 0.5 + metric_test_maturity: 1 + metric_test_requirements: + - target: https://f-uji.net/vocab/identifier/unique + tested_on: https://f-uji.net/vocab/metadata/property/object_identifier + modality: any + required: + name: + - uuid + - hash + comment: identifier can be given as user input + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2020-11-25 + version: 0.5 + total_score: 1 + +- metric_identifier: FsF-F1-02D + metric_number: 2 + metric_short_name: Persistent Identifier + metric_name: Data is assigned a persistent identifier. + description: We make a distinction between the uniqueness and persistence of an identifier. An HTTP URL (the address of a given unique resource on the web) is globally unique, but may not be persistent as the URL of data may be not accessible (link rot problem) or the data available under the original URL may be changed (content drift problem). Identifiers based on the Handle System, DOI, ARK are both globally unique and persistent. They are maintained and governed such that they remain stable and resolvable for the long term. The persistent identifier (PID) of a data object may be resolved (point) to a landing page with metadata containing further information on how to access the data content, in some cases a downloadable artefact, or none if the data or repository is no longer maintained. 
Therefore, ensuring persistence is a shared responsibility between a PID service provider (e.g., datacite) and its clients (e.g., data repositories). For example, the DOI system guarantees the persistence of its identifiers through its social (e.g., policy) and technical infrastructures, whereas a data provider ensures the availability of the resource (e.g., landing page, downloadable artefact) associated with the identifier. + fair_principle: F1 + target: Data + evaluation_mechanism: A persistent identifier is considered to be valid if the given identifier complies with a valid PID syntax. To be valid, the PID further has to be resolvable. + test_scoring_mechanism: alternative + metric_tests: + - metric_test_identifier: FsF-F1-02D-1 + metric_test_name: Identifier follows a defined persistent identifier syntax + metric_test_score: 0.5 + metric_test_maturity: 1 + metric_test_requirements: + - target: https://f-uji.net/vocab/identifier/persistent + tested_on: https://f-uji.net/vocab/metadata/property/object_identifier + modality: any + comment: identifier can be given as user input + - metric_test_identifier: FsF-F1-02D-2 + metric_test_name: Persistent identifier is resolvable + metric_test_requirements: + - target: https://f-uji.net/vocab/identifier/persistent + tested_on: https://f-uji.net/vocab/metadata/property/object_identifier + comment: identifier has to resolve to a valid URI + metric_test_score: 1 + metric_test_maturity: 3 + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2020-11-25 + version: 0.5 + total_score: 1 + ## ---------------- FINDABILITY ---------------- ## +- metric_identifier: FsF-F2-01M-ss + metric_number: 3 + metric_short_name: Descriptive Core Metadata + metric_name: Metadata includes descriptive core elements (title, identifier, publisher, abstract and language) relevant for the social sciences to support data findability. + description: Metadata is descriptive information about a data object. 
Since the metadata required differs depending on the users and their applications, this metric focuses on core metadata. The social science community has defined specific requirements for core metadata and the individual content to be described with it defined in the CESSDA Metadata Model (CMM). These are community specific with respect to certain properties but coincide to a large extent with domain agnostic specifications such as common data citation guidelines (e.g., DataCite, ESIP, and IASSIST), and metadata recommendations for data discovery (e.g., EOSC Datasets Minimum Information (EDMI), DataCite Metadata Schema, W3C Recommendation Data on the Web Best Practices and Data Catalog Vocabulary). Core descriptive metadata for social sciences data are title, identifier, publisher, abstract and language. + fair_principle: F2 + target: Metadata + evaluation_mechanism: Use the data identifier to access its metadata document. Parse or retrieve core metadata, e.g., through one or more options below, combine the results and then verify presence/absence of the core elements in the metadata. Structured data embedded in the landing page of the identifier (e.g., Schema.org, Dublin Core meta tags or RDFa metadata) Typed Links in the HTTP Link header leading to DDI or compatible metadata; for more information, see https://signposting.org/conventions/ Content negotiation (including external negotiation services offered by PID providers) to retrieve DDI metadata or a compatible standard. 
+ test_scoring_mechanism: cumulative + metric_tests: + - metric_test_identifier: FsF-F2-01M-1-ss + metric_test_name: Metadata has been made available via common web methods + metric_test_score: 1 + metric_test_maturity: 1 + - metric_test_identifier: FsF-F2-01M-3-ss + metric_test_name: Core descriptive metadata is available + metric_test_score: 1 + metric_test_maturity: 3 + metric_test_requirements: + - modality: all + target: https://f-uji.net/vocab/metadata/property + required: + name: + - title + - object_identifier + - publisher + - abstract + - language + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2022-05-30 + version: 0.5 + total_score: 2 + +- metric_identifier: FsF-F3-01M + metric_number: 4 + metric_short_name: Inclusion of Data Identifier in Metadata + metric_name: Metadata includes the identifier of the data it describes. + description: The metadata should explicitly specify the identifier of the data such that users can discover and access the data through the metadata. If the identifier specified is persistent and points to a landing page, the data identifier and links to download the data content should be taken into account in the assessment. + fair_principle: F3 + target: Metadata + evaluation_mechanism: Several metadata standards provide the possibility to include links to the actual data content. The presence of such links is evaluated here. 
+ test_scoring_mechanism: cumulative + metric_tests: + - metric_test_identifier: FsF-F3-01M-1 + metric_test_name: Metadata contains data content related information (file name, size, type) + metric_test_score: 0.5 + metric_test_maturity: 1 + metric_test_requirements: + - target: https://f-uji.net/vocab/data/property + tested_on: https://f-uji.net/vocab/metadata/property/object_content_identifier + modality: all + required: + - type + - size + - metric_test_identifier: FsF-F3-01M-2 + metric_test_name: Metadata contains a PID or URL which indicates the location of the downloadable data content + metric_test_score: 0.5 + metric_test_maturity: 3 + metric_test_requirements: + - target: https://f-uji.net/vocab/data/property/url + tested_on: https://f-uji.net/vocab/metadata/property/object_content_identifier + modality: any + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2022-05-30 + version: 0.5 + total_score: 1 + +- metric_identifier: FsF-F4-01M-ss + metric_number: 5 + metric_short_name: Searchable Metadata + metric_name: Metadata is offered in such a way that it can be retrieved programmatically. + description: This metric refers to ways through which the metadata of data is exposed or provided in a standard and machine-readable format. Assessing this metric will require an understanding of the capabilities offered by the data repository used to host the data. Metadata may be available through multiple endpoints. For example, if data is hosted by a repository, the repository may disseminate its metadata through a metadata harvesting protocol (e.g., via OAI-PMH) and/or a web service. Metadata may also be embedded as structured data on a data page for use by web search engines such as Google and Bing or be available as linked (open) data. + fair_principle: F4 + target: Metadata + evaluation_mechanism: The metric is evaluated using the given metadata standards known to support major search engines such as JSON-LD and Dublin Core. 
Presence of metadata in research data registries is further evaluated. + test_scoring_mechanism: cumulative + metric_tests: + - metric_test_identifier: FsF-F4-01M-3-ss + metric_test_name: Metadata is offered via metadata exchange standard interface (OAI-PMH) + metric_test_score: 1 + metric_test_maturity: 3 + metric_test_requirements: + - modality: any + target: https://f-uji.net/vocab/metadata/exchange_service + required: + name: + - OAI-PMH + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2022-05-30 + version: 0.5 + total_score: 1 + +- metric_identifier: FsF-A1-01M + metric_number: 6 + metric_short_name: Data Access Information + metric_name: Metadata contains access level and access conditions of the data. + description: This metric determines if the metadata includes the level of access to the data such as public, embargoed, restricted, or metadata-only access and its access conditions. Both access level and conditions are necessary information to potentially gain access to the data. It is recommended that data should be as open as possible and as closed as necessary. There are no access conditions for public data. Datasets should be released into the public domain (e.g., with an appropriate public-domain-equivalent license such as Creative Commons CC0 licence) and openly accessible without restrictions when possible. Embargoed access refers to data that will be made publicly accessible at a specific date which should be specified in the metadata. For example, a data author may release their data after having published their findings from the data. Therefore, access conditions such as the date the data will be released publicly are essential. Restricted access refers to data that can be accessed under certain conditions (e.g. because of commercial, sensitive, or other confidentiality reasons or the data is only accessible via a subscription or a fee). 
Restricted data may be available to a particular group of users or after permission is granted. For restricted data, the metadata should include the conditions of access to the data such as point of contact or instructions to access the data. Metadata-only access refers to data that is not made publicly available and for which only metadata is publicly available. + fair_principle: A1 + target: Metadata + evaluation_mechanism: Metric evaluation is based on the presence of access information in an appropriate metadata element/field. + test_scoring_mechanism: alternative + metric_tests: + - metric_test_identifier: FsF-A1-01M-1 + metric_test_name: Information about access restrictions or rights can be identified in metadata + metric_test_score: 0.5 + metric_test_maturity: 1 + metric_test_requirements: + - target: http://f-uji.net/vocab/metadata/property/access_level + modality: any + - metric_test_identifier: FsF-A1-01M-3 + metric_test_name: Data access information is indicated by (not machine readable) standard terms + metric_test_score: 1 + metric_test_maturity: 2 + metric_test_requirements: + - target: http://f-uji.net/vocab/access_condition + modality: any + tested_on: http://f-uji.net/vocab/metadata/property/access_level + comment: label and id + - metric_test_identifier: FsF-A1-01M-2 + metric_test_name: Data access information is machine readable + metric_test_score: 1 + metric_test_maturity: 3 + metric_test_requirements: + - target: http://f-uji.net/vocab/access_condition + modality: any + tested_on: http://f-uji.net/vocab/metadata/property/access_level + comment: identifier (namespace) + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2020-12-03 + version: 0.5 + total_score: 1 + +- metric_identifier: FsF-A1-03D + metric_number: 8 + metric_short_name: Standardized Communication Protocol of Data + metric_name: Data is accessible through a standardized communication protocol. 
+ description: Given an identifier of a dataset, the dataset should be retrievable using a standard communication protocol such as HTTP, HTTPS, FTP, TFTP, SFTP, FTAM and AtomPub. Avoid disseminating data using a proprietary protocol. + fair_principle: A1 + target: Data + evaluation_mechanism: The data link which is given in the metadata is tested for a standard communication protocol + test_scoring_mechanism: alternative + metric_tests: + - metric_test_identifier: FsF-A1-03D-1 + metric_test_name: Metadata includes a resolvable link to data based on standardized web communication protocols. + metric_test_score: 1 + metric_test_maturity: 3 + created_by: FAIRsFAIR + date_created: 2020-10-23 + date_updated: 2020-12-05 + version: 0.5 + total_score: 1 + +- metric_identifier: FsF-A1-02M + metric_number: 7 + metric_short_name: Standardized Communication Protocol of Metadata + metric_name: Metadata is accessible through a standardized communication protocol. + description: Given an identifier of a dataset, the metadata of the dataset should be retrievable using a standard communication protocol such as HTTP, HTTPS, FTP, TFTP, SFTP, FTAM and AtomPub. Avoid disseminating data using a proprietary protocol. + fair_principle: A1 + target: Metadata + evaluation_mechanism: The URI scheme of the landing page is tested for a standard communication protocol + test_scoring_mechanism: alternative + metric_tests: + - metric_test_identifier: FsF-A1-02M-1 + metric_test_name: Landing page link is based on standardized web communication protocols. + metric_test_score: 1 + metric_test_maturity: 3 + created_by: FAIRsFAIR + date_created: 2020-10-23 + date_updated: 2020-12-05 + version: 0.5 + total_score: 1 + +- metric_identifier: FsF-A2-01M + metric_number: 9 + metric_short_name: Metadata Preservation + metric_name: Metadata remains available, even if the data is no longer available. 
+ description: This metric determines if the metadata will be preserved even when the data they represent are no longer available, replaced or lost. + fair_principle: A2 + target: Metadata + evaluation_mechanism: Currently this metric can only be assessed using the persistent identifier as an indicator. DOI metadata is preserved by DataCite. + metric_tests: + - metric_test_identifier: FsF-A2-01M-1 + metric_test_name: The persistent identifier system used guarantees the preservation of associated metadata + metric_test_score: 1 + metric_test_maturity: 3 + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2020-12-05 + version: 0.5 + total_score: 1 + + +- metric_identifier: FsF-I1-01M + metric_number: 10 + metric_short_name: Formal Representation of Metadata + metric_name: Metadata is represented using a formal knowledge representation language. + description: Knowledge representation is vital for machine-processing of the knowledge of a domain. Expressing the metadata of a data object using a formal knowledge representation will enable machines to process it in a meaningful way and enable more data exchange possibilities. Examples of knowledge representation languages are RDF, RDFS, and OWL. These languages may be serialized (written) in different formats. For instance, RDF/XML, RDFa, Notation3, Turtle, N-Triples and N-Quads, and JSON-LD are RDF serialization formats. + fair_principle: I1 + target: Metadata + evaluation_mechanism: Metadata has to be serialised in a common formal knowledge representation language. 
+ test_scoring_mechanism: cumulative + metric_tests: + - metric_test_identifier: FsF-I1-01M-1 + metric_test_name: Parsable, structured metadata (JSON-LD, RDFa) is embedded in the landing page XHTML/HTML code + metric_test_score: 1 + metric_test_maturity: 2 + metric_test_requirements: + - target: http://f-uji.net/vocab/metadata/format + modality: any + required: + name: + - RDF + - JSON-LD + - RDFa + - target: http://f-uji.net/vocab/metadata/offering_method + modality: any + required: + name: + - meta_tags + - microdata + - rdfa + - metric_test_identifier: FsF-I1-01M-2 + metric_test_name: Parsable, graph data (RDF, JSON-LD) is accessible through content negotiation, typed links or sparql endpoint + metric_test_score: 1 + metric_test_maturity: 3 + metric_test_requirements: + - target: http://f-uji.net/vocab/metadata/format + modality: any + required: + name: + - RDF + - JSON-LD + - RDFa + - target: http://f-uji.net/vocab/metadata/offering_method + modality: any + required: + name: + - content_negotiation + - target: http://f-uji.net/vocab/metadata/exchange_service + modality: any + required: + name: + - sparql + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2023-06-01 + version: 0.5 + total_score: 2 + +- metric_identifier: FsF-I2-01M-ss + metric_number: 11 + metric_short_name: Metadata with Semantic Resources + metric_name: Metadata uses semantic resources + description: A metadata document or selected parts of the document may incorporate additional terms from semantic resources (also referred as semantic artefacts) so that the contents are unambiguous and can be processed automatically by machines. This enrichment facilitates enhanced data search and interoperability of data from different sources. Ontology, thesaurus, and taxonomy are kinds of semantic resources, and they come with varying degrees of expressiveness and computational complexity. 
Knowledge organization schemes such as thesaurus and taxonomy are semantically less formal than ontologies. + fair_principle: I2 + target: Metadata + evaluation_mechanism: Used namespaces are identified in given graph or XML metadata and verified using a controlled list. + test_scoring_mechanism: cumulative + metric_tests: + - metric_test_identifier: FsF-I2-01M-2-ss + metric_test_name: Namespaces of known semantic resources can be identified in metadata + metric_test_score: 1 + metric_test_maturity: 3 + metric_test_requirements: + - target_property: uri_format + target: https://f-uji.net/vocab/metadata/semantic_resource + modality: any + match: wildcard + required: + identifier: + - https://vocabularies.cessda.eu/* + - http://rdf-vocabulary.ddialliance.org/* + - https://www.gesis.org/vocabulary/* + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2020-12-03 + version: 0.5 + total_score: 1 + + +- metric_identifier: FsF-I3-01M + metric_number: 12 + metric_short_name: Links to related entities + metric_name: Metadata includes links between the data and its related entities. + description: Linking data to its related entities will increase its potential for reuse. The linking information should be captured as part of the metadata. A dataset may be linked to its prior version, related datasets or resources (e.g. publication, physical sample, funder, repository, platform, site, or observing network registries). Links between data and its related entities should be expressed through relation types (e.g., DataCite Metadata Schema specifies relation types between research objects through the fields ‘RelatedIdentifier’ and ‘RelationType’), and preferably use persistent Identifiers for related entities (e.g., ORCID for contributors, DOI for publications, and ROR for institutions). + fair_principle: I3 + target: Metadata + evaluation_mechanism: Metadata is checked for existing relations to related entities which can be e.g. 
citations or other related resources + metric_tests: + - metric_test_identifier: FsF-I3-01M-1 + metric_test_name: Related resources are explicitly mentioned in metadata + metric_test_score: 1 + metric_test_maturity: 2 + metric_test_requirements: + - target: http://f-uji.net/vocab/relation_type + modality: any + tested_on: http://f-uji.net/vocab/metadata/property/related_resources + comment: The presence of a (typed, default = related) related resource is checked, can be a string or URI + - metric_test_identifier: FsF-I3-01M-2 + metric_test_name: Related resources are indicated by machine readable links or identifiers + metric_test_requirements: + - comment: same as above but relations have to be machine readable/actionable + metric_test_score: 1 + metric_test_maturity: 3 + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2020-12-03 + version: 0.5 + total_score: 1 + +- metric_identifier: FsF-R1-01MD + metric_number: 13 + metric_short_name: Metadata of Data Content + metric_name: Metadata specifies the content of the data. + description: This metric evaluates if a description (properties) of the content of the data is specified in the metadata. The description should be an accurate reflection of the actual data deposited. Data content descriptors include but are not limited to resource type (e.g., data or a collection of data), variable(s) measured or observed, method, data format and size. Ideally, ontological vocabularies should be used to describe data content to support interdisciplinary reuse. + fair_principle: R1 + target: Metadata, Data + evaluation_mechanism: Metric is evaluated using the resource type given in the metadata as well as data object specific properties file size and file type. Further presence of measured variables is tested. 
+ test_scoring_mechanism: cumulative + metric_tests: + - metric_test_identifier: FsF-R1-01MD-1_ss + metric_test_name: Minimal information about available data content is given in metadata + metric_test_score: 1 + metric_test_maturity: 1 + - metric_test_identifier: FsF-R1-01MD-1a + metric_test_name: Resource type (e.g. dataset) is given in metadata + metric_test_score: 0 + - metric_test_identifier: FsF-R1-01MD-1b + metric_test_name: Information about data content (e.g. links) is given in metadata + metric_test_score: 0 + - metric_test_identifier: FsF-R1-01MD-2 + metric_test_name: Verifiable data descriptors (file info, measured variables or observation types) are specified in metadata + metric_test_score: 1 + metric_test_maturity: 2 + - metric_test_identifier: FsF-R1-01MD-2a + metric_test_name: File size and type information are specified in metadata + metric_test_score: 0 + - metric_test_identifier: FsF-R1-01MD-2b + metric_test_name: Measured variables or observation types are specified in metadata + metric_test_score: 0 + - metric_test_identifier: FsF-R1-01MD-3 + metric_test_name: Data content matches file type and size specified in metadata + metric_test_score: 1 + metric_test_maturity: 3 + - metric_test_identifier: FsF-R1-01MD-4 + metric_test_name: Data content matches measured variables or observation types specified in metadata + metric_test_score: 1 + metric_test_maturity: 3 + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2020-07-08 + version: 0.5 + total_score: 4 + +- metric_identifier: FsF-R1.1-01M-ss + metric_number: 14 + metric_short_name: Data Usage License + metric_name: Metadata includes license information under which data can be reused. + description: This metric evaluates if data is associated with a license because otherwise users cannot reuse it in a clear legal context. We encourage the application of licenses for all kinds of data whether public, restricted or for specific users. 
Without an explicit license, users do not have a clear idea of what can be done with your data. Licenses can be of standard type (Creative Commons, Open Data Commons Open Database License) or bespoke licenses, and rights statements which indicate the conditions under which data can be reused. It is highly recommended to use a standard, machine-readable license such that it can be interpreted by machines and humans. In order to inform users about what rights they have to use a dataset, the license information should be specified as part of the dataset’s metadata. + fair_principle: R1.1 + target: Metadata + evaluation_mechanism: Metric evaluation is based on the presence of a machine readable license information in an appropriate metadata element/field. + test_scoring_mechanism: cumulative + metric_tests: + - metric_test_identifier: FsF-R1.1-01M-1-ss + metric_test_name: Licence information is given in an appropriate metadata element + metric_test_score: 1 + metric_test_maturity: 1 + - metric_test_identifier: FsF-R1.1-01M-2-ss + metric_test_name: Recognized licence is valid (community specific or registered at SPDX) + metric_test_score: 1 + metric_test_maturity: 3 + metric_test_requirements: + - modality: any + target: https://f-uji.net/vocab/licenses + match: wildcard + required: + name: + - CC-BY* + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2023-06-02 + version: 0.5 + total_score: 2 + +- metric_identifier: FsF-R1.2-01M + metric_number: 15 + metric_short_name: Data Provenance + metric_name: Metadata includes provenance information about data creation or generation. + description: >- + Data provenance (also known as lineage) represents a dataset’s history, including the people, entities, and processes involved in its creation, management and longer-term curation. It is essential to provide provenance information about your data to provide valuable context and to enable informed use and reuse. 
The levels of provenance information needed can vary depending on the data type (e.g., measurement, observation, derived data, or data product) and research domains. For that reason, it is difficult to define a set of finite provenance properties that will be adequate for all domains. Based on existing work, we suggest that the following provenance properties of data generation or collection are included in the metadata record as a minimum. + (a) Sources of data, e.g., datasets the data is derived from and instruments + (b) Data creation or collection date + (c) Contributors involved in data creation and their roles + (d) Data publication, modification and versioning information + There are various ways through which provenance information may be included in a metadata record. Some of the provenance properties (e.g., instrument, contributor) may be best represented using PIDs (such as DOIs for data, ORCIDs for researchers). + This way, humans and systems can retrieve more information about each of the properties by resolving the PIDs. Alternatively, the provenance information can be given in a linked provenance record expressed explicitly in e.g., PROV-O or PAV or Vocabulary of Interlinked Datasets (VoID). + fair_principle: R1.2 + target: Metadata + evaluation_mechanism: Metrics are assessed using provenance related information contained in metadata which can either be specific elements which can be mapped e.g. to PROV-O or the use of provenance related namespaces and associated terms. 
+ test_scoring_mechanism: cumulative + metric_tests: + - metric_test_identifier: FsF-R1.2-01M-1 + metric_test_name: Metadata contains elements which hold provenance information and can be mapped to PROV + metric_test_score: 1 + metric_test_maturity: 2 + - metric_test_identifier: FsF-R1.2-01M-2 + metric_test_name: Metadata contains provenance information using formal provenance ontologies (PROV-O) + metric_test_score: 1 + metric_test_maturity: 3 + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2023-06-01 + version: 0.5 + total_score: 2 + +- metric_identifier: FsF-R1.3-01M-ss + metric_number: 16 + metric_short_name: Community-Endorsed Metadata Standard + metric_name: Metadata follows a standard recommended by the target research community of the data. + description: In addition to core metadata required to support data discovery (covered under metric FsF-F2-01M), metadata to support data reusability should be made available following community-endorsed metadata standards. Some communities have well-established metadata standards (e.g., geospatial [ISO19115], biodiversity [DarwinCore, ABCD, EML], social science [DDI], astronomy [International Virtual Observatory Alliance Technical Specifications]) while others have limited standards or standards that are under development (e.g., engineering and linguistics). The use of community-endorsed metadata standards is usually encouraged and supported by domain and discipline-specific repositories. 
+ fair_principle: R1.3 + target: Metadata + evaluation_mechanism: Metadata encodings can be verified using community specific namespaces and schemas listed by the RDA metadata standards WG or fairsharing.org + test_scoring_mechanism: alternative + metric_tests: + - metric_test_identifier: FsF-R1.3-01M-1-ss + metric_test_name: Community specific metadata standard is detected using namespaces or schemas found in provided metadata or metadata services outputs + metric_test_score: 1 + metric_test_maturity: 3 + metric_test_requirements: + - modality: any + target: https://f-uji.net/vocab/metadata/standards + match: full + required: + name: + - ddi-data-documentation-initiative + - dara-metadata-schema + - metric_test_identifier: FsF-R1.3-01M-3-ss + metric_test_name: Multidisciplinary but community endorsed metadata standard is listed in the re3data record or detected by namespace + metric_test_score: 1 + metric_test_maturity: 1 + metric_test_requirements: + - modality: any + target: https://f-uji.net/vocab/metadata/standards + match: full + required: + name: + - datacite-metadata-schema + - dcat-data-catalog-vocabulary + - schemaorg + - dublin-core + - metric_test_identifier: FsF-R1.3-01M-2-ss + metric_test_name: Community specific metadata standard is listed in the re3data record of the responsible repository + metric_test_score: 1 + metric_test_maturity: 2 + metric_test_requirements: + - modality: any + target: https://f-uji.net/vocab/metadata/standards + match: full + required: + - ddi-data-documentation-initiative + - dara-metadata-schema + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2020-12-03 + version: 0.5 + total_score: 1 +- metric_identifier: FsF-R1.3-02D + metric_number: 17 + metric_short_name: Data File format + metric_name: Data is available in a file format recommended by the target research community. + description: >- + File formats refer to methods for encoding digital information. 
For example, CSV is used for tabular data, NetCDF for multidimensional data, and GeoTIFF for raster imagery. Data should be made available in a file format that is backed by the research community to enable data sharing and reuse. Consider, for example, file formats that are widely used and supported by the most commonly used software and tools. These formats should also be suitable for long-term storage and archiving, and are usually the ones recommended by a data repository. These formats not only give greater certainty that your data can be read in the future, but also help to increase reusability and interoperability. Using community-endorsed formats enables data to be loaded directly into the software and tools used for data analysis. It makes it possible to easily integrate your data with other data using the same preferred format. The use of preferred formats will also help to transform the format to a newer one in case a preferred format becomes outdated. + Similar to metric FsF-F4-01M, answering this metric will require an understanding of the capabilities offered, data preservation plan and policies implemented by the data repository and data services (e.g., DataCite PID service). + Continued access to metadata depends on a data repository’s preservation practice, which is usually documented in the repository’s service policies or statements. + A trustworthy data repository offering DOIs and implementing a PID Policy should guarantee that metadata will remain accessible even when data is no longer available for any reason (e.g., by providing a tombstone page). + fair_principle: R1.3 + target: Data + evaluation_mechanism: Data file format given in metadata is compared to a controlled list of known scientific formats. 
+ test_scoring_mechanism: alternative + metric_tests: + - metric_test_identifier: FsF-R1.3-02D-1 + metric_test_name: The format of a data file given in the metadata is listed in the long term file formats, open file formats or scientific file formats controlled list + metric_test_score: 1 + - metric_test_identifier: FsF-R1.3-02D-1a + metric_test_name: The format of the data file is an open format + metric_test_score: 0 + metric_test_maturity: 1 + - metric_test_identifier: FsF-R1.3-02D-1b + metric_test_name: The format of the data file is a long term format + metric_test_score: 0 + metric_test_maturity: 2 + - metric_test_identifier: FsF-R1.3-02D-1c + metric_test_name: The format of the data file is a scientific format + metric_test_score: 0 + metric_test_maturity: 3 + created_by: FAIRsFAIR + date_created: 2020-07-08 + date_updated: 2020-12-03 + version: 0.5 + total_score: 1 + metric_specification: 10.5281/zenodo.6461229 diff --git a/pyproject.toml b/pyproject.toml index 0eaa940d..d0e7014d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ license = "MIT" name = "fuji" readme = "README.md" requires-python = "~=3.11" # at the moment only Python 3.11 is supported -version = "3.2.2" +version = "3.3.0" [project.optional-dependencies] docs = [