Merge branch 'pangaea-data-publisher:master' into dev
karacolada authored May 3, 2024
2 parents 8fc3dc6 + 9f419b8 commit 4c6c528
Showing 7 changed files with 76 additions and 10 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/reports.yml
@@ -20,12 +20,12 @@ jobs:
actions: read
steps:
- name: Download and Extract Artifacts
- uses: dawidd6/action-download-artifact@e7466d1a7587ed14867642c2ca74b5bcc1e19a2d # v3.0.0
+ uses: dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3.1.4
with:
run_id: ${{ github.event.workflow_run.id }}
path: artifacts
- name: Publish Test Results
- uses: EnricoMi/publish-unit-test-result-action@e780361cd1fc1b1a170624547b3ffda64787d365 # v2.12.0
+ uses: EnricoMi/publish-unit-test-result-action@30eadd5010312f995f0d3b3cff7fe2984f69409e # v2.16.1
with:
comment_title: ':clipboard: Pytest Results'
commit: ${{ github.event.workflow_run.head_sha }}
@@ -43,7 +43,7 @@ jobs:
actions: read
steps:
- name: Download and Extract Artifacts
- uses: dawidd6/action-download-artifact@e7466d1a7587ed14867642c2ca74b5bcc1e19a2d # v3.0.0
+ uses: dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3.1.4
with:
run_id: ${{ github.event.workflow_run.id }}
path: artifacts
7 changes: 6 additions & 1 deletion README.md
@@ -7,7 +7,7 @@ Thanks to [Heinz-Alexander Fuetterer](https://github.com/afuetterer) for his con
[![Coverage](https://pangaea-data-publisher.github.io/fuji/coverage/coveragebadge.svg)](https://pangaea-data-publisher.github.io/fuji/coverage/)

[![Publish Docker image](https://github.com/pangaea-data-publisher/fuji/actions/workflows/publish-docker.yml/badge.svg)](https://github.com/pangaea-data-publisher/fuji/actions/workflows/publish-docker.yml)
- [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4063720.svg)](https://doi.org/10.5281/zenodo.4063720)
+ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.11084909.svg)](https://doi.org/10.5281/zenodo.11084909)

## Overview

@@ -233,6 +233,11 @@ The multiple test methods within an evaluator also check whether their specific
For each metric, the maturity is determined as the maximum of the maturity associated with each passed test.
This means that if a test indicating maturity 3 is passed and one indicating maturity 2 is not passed, the metric will still be shown to be fulfilled with maturity 3.
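
As an editorial aside (not part of this commit), the aggregation rule above is simply a maximum over the maturities of the passed tests. A minimal sketch, using made-up test identifiers and a simplified data structure rather than F-UJI's internal one:

~~~python
# Sketch of the maturity rule described above: a metric's maturity is the maximum
# maturity among its passed tests. Test ids and the (passed, maturity) tuples are
# illustrative, not F-UJI's internal representation.
def metric_maturity(test_results: dict[str, tuple[bool, int]]) -> int:
    passed_maturities = [maturity for passed, maturity in test_results.values() if passed]
    return max(passed_maturities, default=0)

# A maturity-3 test passes while a maturity-2 test fails: the metric still reports maturity 3.
print(metric_maturity({"test-1": (True, 3), "test-2": (False, 2)}))  # -> 3
~~~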

+ ### Community-specific metrics

+ Some, but not all, metrics can be configured using the following guidelines:
+ [Metrics configuration guide](https://github.com/pangaea-data-publisher/fuji/blob/master/metrics_configuration.md)

### Updates to the API

Making changes to the API requires re-generating parts of the code using Swagger.
9 changes: 7 additions & 2 deletions fuji_server/evaluators/fair_evaluator_license.py
@@ -50,6 +50,7 @@ def setLicenseDataAndOutput(self):
specified_licenses = [specified_licenses]
if specified_licenses is not None and specified_licenses != []:
for license in specified_licenses:
+ # print(license)
isurl = False
licence_valid = False
license_output = LicenseOutputInner()
@@ -58,8 +59,12 @@
if isurl:
iscc, generic_cc = self.isCreativeCommonsLicense(license, self.metric_identifier)
if iscc:
- license = generic_cc
- spdx_uri, spdx_osi, spdx_id = self.lookup_license_by_url(license, self.metric_identifier)
+ spdx_osi = True
+ spdx_uri = license
+ spdx_id = generic_cc
+ # license = generic_cc
+ else:
+ spdx_uri, spdx_osi, spdx_id = self.lookup_license_by_url(license, self.metric_identifier)
else: # maybe licence name
spdx_uri, spdx_osi, spdx_id = self.lookup_license_by_name(license, self.metric_identifier)
license_output.license = license
5 changes: 5 additions & 0 deletions fuji_server/evaluators/fair_evaluator_searchable.py
@@ -153,6 +153,11 @@ def testSearchEngineCompatibleMetadataAvailable(self):
search_engine_support_match.append(
standard_found + " via: " + found_metadata.get("offering_method")
)
+ else:
+ self.logger.info(
+ self.metric_identifier
+ + "Found RDFa like metadata which however is empty thus useless for search engines"
+ )
search_engine_support_match = list(set(search_engine_support_match))
# OLD WAY
# Check search mechanisms based on sources of metadata extracted.
4 changes: 2 additions & 2 deletions fuji_server/helper/metadata_collector_rdf.py
@@ -1015,7 +1015,7 @@ def get_dcat_metadata(self, graph):
dcat_metadata = self.get_metadata(graph, datasets[0], type="Dataset")
# distribution
distribution = graph.objects(datasets[0], DCAT.distribution)

# do something (check for table headers) with the table here..
for t in table:
print(t)
dcat_metadata["object_content_identifier"] = []
@@ -1057,7 +1057,7 @@
dist, DCTERMS.rights
)
dtype = graph.value(dist, DCAT.mediaType)
- dsize = graph.value(dist, DCAT.bytesSize)
+ dsize = graph.value(dist, DCAT.byteSize)
if durl or dtype or dsize:
if idutils.is_url(str(durl)):
dtype = "/".join(str(dtype).split("/")[-2:])
51 changes: 51 additions & 0 deletions metrics_configuration.md
@@ -0,0 +1,51 @@
# F-UJI configuration options

Since version 3.0.0, F-UJI offers a variety of configuration options which allow you to use user-defined metrics and to restrict metadata harvesting methods.

## Metric YAML

You can provide your own metric definitions in a dedicated YAML file. Metrics YAML files have to comply with the following conventions:

* Files need to be located in the folder 'yaml'.
* File names have to follow this syntax: metrics_[version][community_code].yaml, where [version] has to be a number, which can optionally have one decimal point.

For now, user-defined metrics have to be based on the metrics file 'metrics_0.6.yaml', which should be used as a template.

Copy the YAML content of this metrics file into a new file and save it using the file name syntax described above, e.g. metrics_0.6new.yaml.

To define your own metrics, you can restrict the set of metrics and add configuration options to a limited number of existing metrics.

### Configure metrics and tests to be performed

To restrict the metrics, choose those you want to use from the 0.6 list of metrics and tests, and simply delete the tests or metrics which you do not wish to be performed during your assessments.

### Configure individual metrics tests

For all metrics and tests you can change the YAML properties *metric_short_name*, *metric_name* and *description* according to your needs.

For some tests you can define additional parameters. For example, you can specify exactly which metadata elements, licenses, metadata standards or vocabularies are expected.

Generally, these specifications are defined using the YAML property *community_requirements*, which has to be a dictionary containing the subproperties *target*, *modality* and *required*.

* *target* defines the test target, defined in the F-UJI ontology, such as licenses, metadata properties etc.; it is represented by a controlled list of values which is used for tests by default.
* *required* has to be a list which defines the necessary property values.
* *modality* defines whether *all* or *any* of the *required* values need to be present to pass the test.
* *match* specifies how matching values are identified: *wildcard* for wildcard-like match rules such as 'test*'; *full* when a full match is required.
* *target_property* additionally defines the property of the *target* object in which matches are searched for; by default, the property *name* or *label* is used for this purpose.

*modality* and *match* are currently not yet implemented (still hardcoded), but may be implemented in future versions. A sketch of such a configuration block is shown below.
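
The following is a minimal, hypothetical sketch of such a configuration inside a metric definition; the metric identifier, the surrounding metric properties and the concrete *required* values are illustrative only, and a real metrics file should start from the metrics_0.6.yaml template:

~~~yaml
# Hypothetical excerpt from a community metrics file, e.g. yaml/metrics_0.6new.yaml.
# Property names follow the description above; identifier and values are examples only.
metric_identifier: FsF-R1.1-01M
metric_short_name: Data Usage License
metric_name: Data is associated with a license accepted by the community.
description: Checks whether one of the licenses expected by the community is specified.
community_requirements:
  target: licenses            # test target defined in the F-UJI ontology
  target_property: name       # property of the target object used for matching
  modality: any               # 'any' or 'all' of the required values must match
  match: full                 # 'full' or 'wildcard' matching (not yet implemented)
  required:
    - Creative Commons Attribution 4.0 International
    - Creative Commons Zero v1.0 Universal
~~~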

## Selecting a metric within an API call

Within the POST data you need to specify which metrics version should be used. To do this, use the *metric_version* argument:
~~~
{
"object_identifier": "https://doi.org/10.1594/PANGAEA.908011",
"test_debug": true,
"metadata_service_endpoint": "http://ws.pangaea.de/oai/provider",
"metadata_service_type": "oai_pmh",
"use_datacite": true,
"use_github": false,
"metric_version": "metrics_v0.5"
}
~~~
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -29,7 +29,7 @@ dependencies = [
"hashid~=3.1.4",
"idutils~=1.2",
"jmespath~=1.0",
"levenshtein~=0.24.0",
"levenshtein~=0.25.0",
"lxml==5.1.0",
"pandas~=2.1",
"pyRdfa3~=3.5",
@@ -83,7 +83,7 @@ report = [
testing = [
"genbadge[coverage]~=1.1",
"pytest~=8.0",
"pytest-cov~=4.1",
"pytest-cov~=5.0",
"pytest-randomly~=3.15",
"pytest-recording~=0.13",
"pytest-xdist~=3.3"
