Skip to content

Commit

Permalink
updated +- to pr480
Browse files Browse the repository at this point in the history
  • Loading branch information
huberrob committed Sep 25, 2024
2 parents f4e5df0 + 3ca81ab commit 1566637
Show file tree
Hide file tree
Showing 21 changed files with 1,530 additions and 486 deletions.
5 changes: 3 additions & 2 deletions fuji_server/config/server.ini
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@ debug_mode = true
data_files_limit = 5
log_config = config/logging.ini
logdir = logs
verify_pids = false
# the URI which triggers the remote logging; all other F-UJI server requests are ignored
remote_log_host = fuji.localhost
remote_log_path = /loghandler/index.php
remote_log_host =
remote_log_path =
rate_limit = 100 per minute
# limits the maximum size of content (metadata) which can be downloaded
max_content_size = 5000000
Expand Down
157 changes: 84 additions & 73 deletions fuji_server/controllers/fair_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import hashlib
import io
import json
import logging
import logging.handlers
import re
Expand Down Expand Up @@ -50,7 +51,7 @@
from fuji_server.harvester.data_harvester import DataHarvester
from fuji_server.harvester.github_harvester import GithubHarvester
from fuji_server.harvester.metadata_harvester import MetadataHarvester
from fuji_server.helper.linked_vocab_helper import linked_vocab_helper
from fuji_server.helper.linked_vocab_helper import LinkedVocabHelper
from fuji_server.helper.metadata_collector import MetadataOfferingMethods
from fuji_server.helper.metadata_mapper import Mapper
from fuji_server.helper.metric_helper import MetricHelper
Expand All @@ -70,7 +71,7 @@ class FAIRCheck:
LONG_TERM_FILE_FORMATS = None
OPEN_FILE_FORMATS = None
DEFAULT_NAMESPACES = None
VOCAB_NAMESPACES = None
# VOCAB_NAMESPACES = None
ARCHIVE_MIMETYPES = Mapper.ARCHIVE_COMPRESS_MIMETYPES.value
STANDARD_PROTOCOLS = None
SCHEMA_ORG_CONTEXT = []
Expand Down Expand Up @@ -184,7 +185,7 @@ def __init__(
FAIRCheck.load_predata()
# self.extruct = None
self.extruct_result = {}
self.lov_helper = linked_vocab_helper(self.LINKED_VOCAB_INDEX)
self.lov_helper = LinkedVocabHelper(self.LINKED_VOCAB_INDEX)
self.auth_token = None
self.auth_token_type = "Basic"

Expand Down Expand Up @@ -241,8 +242,8 @@ def load_predata(cls):
cls.OPEN_FILE_FORMATS = Preprocessor.get_open_file_formats()
if not cls.DEFAULT_NAMESPACES:
cls.DEFAULT_NAMESPACES = Preprocessor.getDefaultNamespaces()
if not cls.VOCAB_NAMESPACES:
cls.VOCAB_NAMESPACES = Preprocessor.getLinkedVocabs()
# if not cls.VOCAB_NAMESPACES:
# cls.VOCAB_NAMESPACES = Preprocessor.getLinkedVocabs()
if not cls.STANDARD_PROTOCOLS:
cls.STANDARD_PROTOCOLS = Preprocessor.get_standard_protocols()
if not cls.SCHEMA_ORG_CONTEXT:
Expand Down Expand Up @@ -282,43 +283,70 @@ def set_auth_token(self, auth_token, auth_token_type="Basic"):
self.auth_token_type = "Basic"

def clean_metadata(self):
# replace nasty "None" strings by real None
try:
nonerepdict = json.dumps(self.metadata_merged).replace('"None"', "null")
self.metadata_merged = json.loads(nonerepdict)
except:
print("Nasty None replace error")
pass
data_objects = self.metadata_merged.get("object_content_identifier")
if data_objects == {"url": None} or data_objects == [None]:
data_objects = self.metadata_merged["object_content_identifier"] = None
if data_objects is not None:
if not isinstance(data_objects, list):
self.metadata_merged["object_content_identifier"] = [data_objects]

# TODO quick-fix to merge size information - should do it at mapper
if "object_content_identifier" in self.metadata_merged:
if self.metadata_merged.get("object_content_identifier"):
oi = 0
for c in self.metadata_merged["object_content_identifier"]:
if not c.get("size") and self.metadata_merged.get("object_size"):
c["size"] = self.metadata_merged.get("object_size")
# clean mime types in case these are in URI form:
if c.get("type"):
if isinstance(c["type"], list):
c["type"] = c["type"][0]
self.metadata_merged["object_content_identifier"][oi]["type"] = c["type"][0]
mime_parts = str(c.get("type")).split("/")
if len(mime_parts) > 2:
if mime_parts[-2] in [
"application",
"audio",
"font",
"example",
"image",
"message",
"model",
"multipart",
"text",
"video",
]:
self.metadata_merged["object_content_identifier"][oi]["type"] = (
str(mime_parts[-2]) + "/" + str(mime_parts[-1])
)
oi += 1
# duplicate handling
if self.metadata_merged.get("object_content_identifier"):
fdci = {}
for dci in self.metadata_merged.get("object_content_identifier"):
dcurl = dci.get("url")
if dcurl not in fdci:
fdci[dcurl] = dci
else:
# complete size and type
if not fdci[dcurl].get("type") and dci.get("type"):
fdci[dcurl]["type"] = dci.get("type")
if not fdci[dcurl].get("size") and dci.get("size"):
fdci[dcurl]["size"] = dci.get("size")
self.metadata_merged["object_content_identifier"] = [di for di in fdci.values()]

# if "object_content_identifier" in self.metadata_merged:
# if self.metadata_merged.get("object_content_identifier"):
oi = 0
for c in self.metadata_merged["object_content_identifier"]:
if (
not c.get("size")
and self.metadata_merged.get("object_size")
and len(self.metadata_merged["object_content_identifier"]) == 1
):
# c["size"] = self.metadata_merged.get("object_size")
self.metadata_merged["object_content_identifier"][oi]["size"] = self.metadata_merged.get(
"object_size"
)
# clean mime types in case these are in URI form:
if c.get("type"):
if isinstance(c["type"], list):
c["type"] = c["type"][0]
self.metadata_merged["object_content_identifier"][oi]["type"] = c["type"][0]
mime_parts = str(c.get("type")).split("/")
if len(mime_parts) > 2:
if mime_parts[-2] in [
"application",
"audio",
"font",
"example",
"image",
"message",
"model",
"multipart",
"text",
"video",
]:
self.metadata_merged["object_content_identifier"][oi]["type"] = (
str(mime_parts[-2]) + "/" + str(mime_parts[-1])
)
oi += 1
# clean empty entries
for mk, mv in list(self.metadata_merged.items()):
if mv == "" or mv is None:
Expand All @@ -328,7 +356,6 @@ def harvest_all_metadata(self):
# ========= clean merged metadata, delete all entries which are None or ''
self.retrieve_metadata_embedded()
self.retrieve_metadata_external()
self.clean_metadata()
self.logger.info(
"FsF-F2-01M : Type of object described by the metadata -: {}".format(
self.metadata_merged.get("object_type")
Expand Down Expand Up @@ -374,8 +401,8 @@ def harvest_github(self):

def retrieve_metadata_embedded(self):
self.metadata_harvester.retrieve_metadata_embedded()
self.metadata_unmerged.extend(self.metadata_harvester.metadata_unmerged)
self.metadata_merged.update(self.metadata_harvester.metadata_merged)
# self.metadata_unmerged.extend(self.metadata_harvester.metadata_unmerged)
# self.metadata_merged.update(self.metadata_harvester.metadata_merged)
self.repeat_pid_check = self.metadata_harvester.repeat_pid_check
self.namespace_uri.extend(self.metadata_harvester.namespace_uri)
self.metadata_sources.extend(self.metadata_harvester.metadata_sources)
Expand All @@ -391,8 +418,8 @@ def retrieve_metadata_embedded(self):

def retrieve_metadata_external(self, target_url=None, repeat_mode=False):
self.metadata_harvester.retrieve_metadata_external(target_url, repeat_mode=repeat_mode)
self.metadata_unmerged.extend(self.metadata_harvester.metadata_unmerged)
self.metadata_merged.update(self.metadata_harvester.metadata_merged)
# self.metadata_unmerged.extend(self.metadata_harvester.metadata_unmerged)
# self.metadata_merged.update(self.metadata_harvester.metadata_merged)
self.repeat_pid_check = self.metadata_harvester.repeat_pid_check
self.namespace_uri.extend(self.metadata_harvester.namespace_uri)
self.metadata_sources.extend(self.metadata_harvester.metadata_sources)
Expand All @@ -403,35 +430,10 @@ def retrieve_metadata_external(self, target_url=None, repeat_mode=False):
self.pid_scheme = self.metadata_harvester.pid_scheme
self.pid_collector.update(self.metadata_harvester.pid_collector)

"""def lookup_metadatastandard_by_name(self, value):
found = None
# get standard name with the highest matching percentage using fuzzywuzzy
highest = process.extractOne(value, FAIRCheck.COMMUNITY_METADATA_STANDARDS_NAMES, scorer=fuzz.token_sort_ratio)
if highest[1] > 80:
found = highest[2]
return found
def lookup_metadatastandard_by_uri(self, value):
found = None
if value:
value = str(value).strip().strip('#/')
# try to find it as direct match using http or https as prefix
if value.startswith('http') or value.startswith('ftp'):
value = value.replace('s://', '://')
found = FAIRCheck.COMMUNITY_METADATA_STANDARDS_URIS.get(value)
if not found:
found = FAIRCheck.COMMUNITY_METADATA_STANDARDS_URIS.get(value.replace('://', 's://'))
if not found:
#fuzzy as fall back
try:
match = process.extractOne(value,
FAIRCheck.COMMUNITY_METADATA_STANDARDS_URIS.keys())
if extract(str(value)).domain == extract(str(match[1]).domain):
if match[1] > 90:
found = list(FAIRCheck.COMMUNITY_METADATA_STANDARDS_URIS.values())[match[2]]
except Exception as e:
pass
return found"""
def set_harvested_metadata(self):
    """Adopt the harvester's collected metadata and clean the merged result.

    Points ``self.metadata_unmerged`` and ``self.metadata_merged`` at the
    corresponding attributes of ``self.metadata_harvester`` (plain
    assignment, no copy is made) and then runs ``self.clean_metadata()``.
    """
    harvester = self.metadata_harvester
    self.metadata_unmerged = harvester.metadata_unmerged
    self.metadata_merged = harvester.metadata_merged
    # Cleaning runs only after both attributes have been replaced.
    self.clean_metadata()

def check_unique_metadata_identifier(self):
unique_identifier_check = FAIREvaluatorUniqueIdentifierMetadata(self)
Expand All @@ -450,7 +452,6 @@ def check_persistent_data_identifier(self):
return persistent_identifier_check.getResult()

def check_unique_persistent_metadata_identifier(self):
    """Run the unique and the persistent metadata identifier checks.

    Returns:
        A 2-tuple ``(unique_result, persistent_result)`` holding the return
        values of ``check_unique_metadata_identifier()`` and
        ``check_persistent_metadata_identifier()``, in that order.
    """
    # The unique-identifier check is evaluated first, then the persistent one.
    unique_result = self.check_unique_metadata_identifier()
    persistent_result = self.check_persistent_metadata_identifier()
    return unique_result, persistent_result

def check_unique_persistent_software_identifier(self):
Expand Down Expand Up @@ -692,8 +693,18 @@ def set_repository_uris(self):
self.metadata_merged["publisher"] = [self.metadata_merged.get("publisher")]
for publisher_url in self.metadata_merged.get("publisher"):
if self.uri_validator(publisher_url):
if self.landing_domain in publisher_url:
if self.landing_domain in publisher_url and publisher_url.count("/") <= 3:
self.repository_urls.append(publisher_url)
if self.repository_urls:
self.repository_urls = list(set(self.repository_urls))
# print("REPOSITORY: ", self.repository_urls)
print("REPOSITORY URIS: ", self.repository_urls)

def set_repository_info(self):
    """Harvest and print metadata for each collected repository URI.

    Calls ``set_repository_uris()`` first. For every entry in
    ``self.repository_urls`` a new ``MetadataHarvester`` is created, its
    embedded and external metadata retrieval is run, and the harvester's
    ``metadata_merged`` is printed to stdout. Nothing is stored on ``self``
    by this method itself.
    """
    self.set_repository_uris()
    # A falsy (empty or unset) URL collection means there is nothing to do.
    for repository_uri in self.repository_urls or ():
        repository_harvester = MetadataHarvester(repository_uri)
        repository_harvester.retrieve_metadata_embedded()
        repository_harvester.retrieve_metadata_external()
        print("########################### REPO METADATA")
        print(repository_harvester.metadata_merged)
19 changes: 19 additions & 0 deletions fuji_server/controllers/fair_object_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from fuji_server.helper.preprocessor import Preprocessor
from fuji_server.helper.results_exporter import FAIRResultsMapper
from fuji_server.models.fair_results import FAIRResults
from fuji_server.models.harvest_results_metadata import HarvestResultsMetadata


async def assess_by_id(body):
Expand Down Expand Up @@ -80,9 +81,12 @@ async def assess_by_id(body):

print("starting harvesting ")
ft.harvest_all_metadata()
ft.set_harvested_metadata()
uid_result, pid_result = ft.check_unique_persistent_metadata_identifier()
if ft.repeat_pid_check:
ft.retrieve_metadata_external(ft.pid_url, repeat_mode=True)
ft.set_harvested_metadata()
ft.clean_metadata()
ft.harvest_re3_data()
ft.harvest_github()
core_metadata_result = ft.check_minimal_metatadata()
Expand Down Expand Up @@ -210,6 +214,20 @@ async def assess_by_id(body):
idhelper = IdentifierHelper(ft.pid_url)
request["normalized_object_identifier"] = idhelper.get_normalized_id()
results.sort(key=lambda d: d["id"]) # sort results by metric ID
#### metadata summary
harvest_result = []
for metadata in ft.metadata_unmerged:
harvest_result.append(
HarvestResultsMetadata(
metadata.get("offering_method"),
metadata.get("url"),
metadata.get("format"),
metadata.get("schema"),
metadata.get("namespaces"),
metadata.get("metadata"),
)
)
###
final_response = FAIRResults(
request=request,
start_timestamp=starttimestmp,
Expand All @@ -222,6 +240,7 @@ async def assess_by_id(body):
results=results,
summary=summary,
resolved_url=resolved_url,
harvested_metadata=harvest_result,
)
accept_header = connexion.request.headers.get("Accept")
print("ACCEPT HEADER ", accept_header)
Expand Down
Loading

0 comments on commit 1566637

Please sign in to comment.