Skip to content

Commit

Permalink
Harvested metadata was not correctly cleaned and merged, so sometimes …
Browse files Browse the repository at this point in the history
…there were duplicate entries; this should be fixed now
  • Loading branch information
huberrob committed Aug 16, 2024
1 parent cb7b584 commit ebd3de5
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 34 deletions.
41 changes: 7 additions & 34 deletions fuji_server/controllers/fair_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,8 +391,8 @@ def harvest_github(self):

def retrieve_metadata_embedded(self):
self.metadata_harvester.retrieve_metadata_embedded()
self.metadata_unmerged.extend(self.metadata_harvester.metadata_unmerged)
self.metadata_merged.update(self.metadata_harvester.metadata_merged)
# self.metadata_unmerged.extend(self.metadata_harvester.metadata_unmerged)
# self.metadata_merged.update(self.metadata_harvester.metadata_merged)
self.repeat_pid_check = self.metadata_harvester.repeat_pid_check
self.namespace_uri.extend(self.metadata_harvester.namespace_uri)
self.metadata_sources.extend(self.metadata_harvester.metadata_sources)
Expand All @@ -408,8 +408,8 @@ def retrieve_metadata_embedded(self):

def retrieve_metadata_external(self, target_url=None, repeat_mode=False):
self.metadata_harvester.retrieve_metadata_external(target_url, repeat_mode=repeat_mode)
self.metadata_unmerged.extend(self.metadata_harvester.metadata_unmerged)
self.metadata_merged.update(self.metadata_harvester.metadata_merged)
# self.metadata_unmerged.extend(self.metadata_harvester.metadata_unmerged)
# self.metadata_merged.update(self.metadata_harvester.metadata_merged)
self.repeat_pid_check = self.metadata_harvester.repeat_pid_check
self.namespace_uri.extend(self.metadata_harvester.namespace_uri)
self.metadata_sources.extend(self.metadata_harvester.metadata_sources)
Expand All @@ -420,35 +420,9 @@ def retrieve_metadata_external(self, target_url=None, repeat_mode=False):
self.pid_scheme = self.metadata_harvester.pid_scheme
self.pid_collector.update(self.metadata_harvester.pid_collector)

"""def lookup_metadatastandard_by_name(self, value):
found = None
# get standard name with the highest matching percentage using fuzzywuzzy
highest = process.extractOne(value, FAIRCheck.COMMUNITY_METADATA_STANDARDS_NAMES, scorer=fuzz.token_sort_ratio)
if highest[1] > 80:
found = highest[2]
return found
def lookup_metadatastandard_by_uri(self, value):
found = None
if value:
value = str(value).strip().strip('#/')
# try to find it as direct match using http or https as prefix
if value.startswith('http') or value.startswith('ftp'):
value = value.replace('s://', '://')
found = FAIRCheck.COMMUNITY_METADATA_STANDARDS_URIS.get(value)
if not found:
found = FAIRCheck.COMMUNITY_METADATA_STANDARDS_URIS.get(value.replace('://', 's://'))
if not found:
#fuzzy as fall back
try:
match = process.extractOne(value,
FAIRCheck.COMMUNITY_METADATA_STANDARDS_URIS.keys())
if extract(str(value)).domain == extract(str(match[1]).domain):
if match[1] > 90:
found = list(FAIRCheck.COMMUNITY_METADATA_STANDARDS_URIS.values())[match[2]]
except Exception as e:
pass
return found"""
def set_harvested_metadata(self):
    """Adopt the harvester's current metadata collections.

    Rebinds ``self.metadata_unmerged`` and ``self.metadata_merged`` to the
    objects currently held by ``self.metadata_harvester``. Nothing is copied,
    extended or merged here: after the call both attributes reference the
    very same objects as the harvester's attributes of the same name.
    """
    harvester = self.metadata_harvester
    self.metadata_unmerged = harvester.metadata_unmerged
    self.metadata_merged = harvester.metadata_merged

def check_unique_metadata_identifier(self):
unique_identifier_check = FAIREvaluatorUniqueIdentifierMetadata(self)
Expand All @@ -467,7 +441,6 @@ def check_persistent_data_identifier(self):
return persistent_identifier_check.getResult()

def check_unique_persistent_metadata_identifier(self):
    """Run the unique-identifier and persistent-identifier metadata checks.

    Returns:
        A 2-tuple ``(unique_result, persistent_result)`` holding the return
        values of ``check_unique_metadata_identifier()`` and
        ``check_persistent_metadata_identifier()``, called in that order.
    """
    unique_result = self.check_unique_metadata_identifier()
    persistent_result = self.check_persistent_metadata_identifier()
    return unique_result, persistent_result

def check_unique_persistent_software_identifier(self):
Expand Down
18 changes: 18 additions & 0 deletions fuji_server/controllers/fair_object_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from fuji_server.helper.preprocessor import Preprocessor
from fuji_server.helper.results_exporter import FAIRResultsMapper
from fuji_server.models.fair_results import FAIRResults
from fuji_server.models.harvest_results_metadata import HarvestResultsMetadata


async def assess_by_id(body):
Expand Down Expand Up @@ -80,9 +81,11 @@ async def assess_by_id(body):

print("starting harvesting ")
ft.harvest_all_metadata()
ft.set_harvested_metadata()
uid_result, pid_result = ft.check_unique_persistent_metadata_identifier()
if ft.repeat_pid_check:
ft.retrieve_metadata_external(ft.pid_url, repeat_mode=True)
ft.set_harvested_metadata()
ft.harvest_re3_data()
ft.harvest_github()
core_metadata_result = ft.check_minimal_metatadata()
Expand Down Expand Up @@ -210,6 +213,20 @@ async def assess_by_id(body):
idhelper = IdentifierHelper(ft.pid_url)
request["normalized_object_identifier"] = idhelper.get_normalized_id()
results.sort(key=lambda d: d["id"]) # sort results by metric ID
#### metadata summary
harvest_result = []
for metadata in ft.metadata_unmerged:
harvest_result.append(
HarvestResultsMetadata(
metadata.get("offering_method"),
metadata.get("url"),
metadata.get("format"),
metadata.get("schema"),
metadata.get("namespaces"),
metadata.get("metadata"),
)
)
###
final_response = FAIRResults(
request=request,
start_timestamp=starttimestmp,
Expand All @@ -222,6 +239,7 @@ async def assess_by_id(body):
results=results,
summary=summary,
resolved_url=resolved_url,
harvested_metadata=harvest_result,
)
accept_header = connexion.request.headers.get("Accept")
print("ACCEPT HEADER ", accept_header)
Expand Down

0 comments on commit ebd3de5

Please sign in to comment.