softwarepub · led02 · Dec 9, 2024 · Dec 9, 2024 · Dec 9, 2024 · Dec 9, 2024
diff --git a/docs/source/dev/data_model.md b/docs/source/dev/data_model.md
@@ -20,8 +20,69 @@ Instead, use {class}`hermes.model.context.HermesContext` and respective subclass
 ## Harvest Data
 
 The data of the havesters is cached in the sub-directory `.hermes/harvest`.
-Each harvester has a separate cache file to allow parallel harvesting.
-The cache file is encoded in JSON and stored in `.hermes/harvest/HARVESTER_NAME.json`
-where `HARVESTER_NAME` corresponds to the entry point name.
 
-{class}`hermes.model.context.HermesHarvestContext` encapsulates these harvester caches.
+
+## Data representation
+
+We are trying to be fully JSON-LD compliant. However, there are two special cases, where we are a bit more lazy:
+
+- Storing provenance of harvested data (for later curation)
+- Storing alternatives of harvested data (for later curation)
+
+Internally, `hermes` works with the expanded version of the JSON-LD file.
+However, there are helper classes that allow you to use compact references.
+
+For the storing of provenance and alternatives, we introduce our own terms.
+Especially, we add a `hermes:meta` key to the top level record.
+This top level key contains a list of additional meta-metadata (i.e., provenance and alternatives).
+
+Each entry in the meta-metadata list is a dictionary that contains at least a `reference` value and one or more of
+`provenance` and `alternative` keys.
+The `reference` value should be a valid JSON Path that references the object that is subject to these metadata.
+The `provenance` value should be a JSON dataset that keeps information about where the data came from.
+The `alternative` value should be a list with alternative records.
+
+Each alternative record contains a `value` and, optionally, an additional `provenance` key.
+
+Example:
+
+```json
+{
+	"@context": [
+		"https://doi.org/10.5063/schema/codemeta-2.0",
+		{"hermes": "https://schema.software-metadata.pub/hermes/1.0"},
+		{"legalName": {"@id": "schema:name"}}
+	],
+	"@type": "SoftwareSourceCode",
+	"author": [
+		{
+			"@id": "https://orcid.org/0000-0001-6372-3853",
+			"@type": "Person",
+			"affiliation": {
+				"@type": "Organization",
+				"legalName": "German Aerospace Center (DLR)"
+			},
+			"familyName": "Meinel",
+			"givenName": "Michael",
+			"email": "[email protected]"
+		}
+	],
+	"description": "Tool to automate software publication. Not stable yet.",
+	"identifier": "https://doi.org/10.5281/zenodo.13221384",
+	"license": "https://spdx.org/licenses/Apache-2.0",
+	"name": "hermes",
+	"version": "0.8.1",
+	"hermes:meta": [
+		{
+			"reference": "$",
+			"provenance": { "harvester": "cff", "source": "CITATION.cff" }
+		},
+		{
+			"reference": "$.author[0].affiliation.legalName",
+			"alternative": [
+				{"value": "DLR e.V.", "provenance": { "harvester": "orcid" }}
+			]
+		}
+	]
+}
+```
diff --git a/hermes.toml b/hermes.toml
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: CC0-1.0
 
 [harvest]
-sources = [ "cff", "toml" ] # ordered priority (first one is most important)
+sources = [ "cff", "toml", "git" ] # ordered priority (first one is most important)
 
 [deposit]
 target = "invenio_rdm"

diff --git a/src/hermes/commands/harvest/base.py b/src/hermes/commands/harvest/base.py
@@ -6,13 +6,13 @@
 
 import argparse
 import typing as t
-from datetime import datetime
 
 from pydantic import BaseModel
 
 from hermes.commands.base import HermesCommand, HermesPlugin
-from hermes.model.context import HermesContext, HermesHarvestContext
+from hermes.model.context_manager import HermesContext
 from hermes.model.errors import HermesValidationError, MergeError
+from hermes.model.ld_utils import bundled_document_loader, jsonld_dict
 
 
 class HermesHarvestPlugin(HermesPlugin):
@@ -39,28 +39,40 @@ class HermesHarvestCommand(HermesCommand):
 
     def __call__(self, args: argparse.Namespace) -> None:
         self.args = args
-        ctx = HermesContext()
 
         # Initialize the harvest cache directory here to indicate the step ran
-        ctx.init_cache("harvest")
+        ctx = HermesContext()
+        ctx.prepare_step('harvest')
 
         for plugin_name in self.settings.sources:
             try:
+                # Load plugin and run the harvester
                 plugin_func = self.plugins[plugin_name]()
                 harvested_data, tags = plugin_func(self)
 
-                with HermesHarvestContext(ctx, plugin_name) as harvest_ctx:
-                    harvest_ctx.update_from(harvested_data,
-                                            plugin=plugin_name,
-                                            timestamp=datetime.now().isoformat(), **tags)
-                    for _key, ((_value, _tag), *_trace) in harvest_ctx._data.items():
-                        if any(v != _value and t == _tag for v, t in _trace):
-                            raise MergeError(_key, None, _value)
+                # Ensure we have a jsonld_dict here to allow expansion
+                if not isinstance(harvested_data, jsonld_dict):
+                    harvested_data = jsonld_dict(**harvested_data)
+
+                # Transform the graph into a canonical form
+                expanded_data, jsonld_context = harvested_data.expand()
+
+                with ctx[plugin_name] as plugin_ctx:
+                    plugin_ctx['data'] = harvested_data
+                    plugin_ctx['jsonld'] = expanded_data
+                    plugin_ctx['context'] = jsonld_context
+                    plugin_ctx['tags'] = tags
 
             except KeyError as e:
                 self.log.error("Plugin '%s' not found.", plugin_name)
                 self.errors.append(e)
 
+            #except HermesHarvestError as e:
+            #    self.log.error("Harvesting %s failed: %s", plugin_name, e)
+            #    self.errors.append(e)
+
             except HermesValidationError as e:
                 self.log.error("Error while executing %s: %s", plugin_name, e)
                 self.errors.append(e)
+
+        ctx.finalize_step('harvest')
diff --git a/src/hermes/commands/harvest/cff.py b/src/hermes/commands/harvest/cff.py
@@ -16,10 +16,10 @@
 import jsonschema
 from cffconvert import Citation
 
+from hermes.commands.harvest.base import HermesHarvestPlugin, HermesHarvestCommand
 from hermes.model.context import ContextPath
 from hermes.model.errors import HermesValidationError
-from hermes.commands.harvest.base import HermesHarvestPlugin, HermesHarvestCommand
-
+from hermes.model.ld_utils import jsonld_dict
 
 # TODO: should this be configurable via a CLI option?
 _CFF_VERSION = '1.2.0'
@@ -53,8 +53,8 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
         # Convert to CodeMeta using cffconvert
         codemeta_dict = self._convert_cff_to_codemeta(cff_data)
         # TODO Replace the following temp patch for #112 once there is a new cffconvert version with cffconvert#309
-        codemeta_dict = self._patch_author_emails(cff_dict, codemeta_dict)
-
+        codemeta_dict = jsonld_dict(self._patch_author_emails(cff_dict, codemeta_dict))
+        codemeta_dict.add_context({'legalName': {'@id': "schema:name"}})
         return codemeta_dict, {'local_path': str(cff_file)}
 
     def _load_cff_from_file(self, cff_data: str) -> t.Any:

diff --git a/src/hermes/model/context_manager.py b/src/hermes/model/context_manager.py
@@ -0,0 +1,58 @@
+import json
+import os.path
+import pathlib
+
+
+class HermesCache:
+    """Directory-backed cache of JSON documents, usable as a context manager.
+
+    Every entry ``name`` is stored as ``<cache_dir>/<name>.json``. Entering the
+    context loads the entries that already exist on disk; leaving it without
+    an exception writes all entries held in memory back to disk.
+    """
+
+    def __init__(self, cache_dir: pathlib.Path):
+        """Remember the cache directory; nothing is read until it is needed.
+
+        :param cache_dir: Directory that holds one JSON file per entry.
+        """
+        self._cache_dir = cache_dir
+        self._cached_data = {}
+
+    @staticmethod
+    def _read_json(filepath: pathlib.Path):
+        """Load one JSON file and close the file handle right away."""
+        with filepath.open('r', encoding='utf-8') as cache_file:
+            return json.load(cache_file)
+
+    def __enter__(self):
+        """Pre-load all ``*.json`` files of the cache directory.
+
+        :return: The cache itself, so it can be bound with ``with ... as``.
+        """
+        if self._cache_dir.is_dir():
+            # Only read what ``__exit__`` writes: regular ``<name>.json``
+            # files. Sub-directories and foreign files are left alone.
+            for filepath in self._cache_dir.glob('*.json'):
+                if filepath.is_file():
+                    self._cached_data[filepath.stem] = self._read_json(filepath)
+
+        return self
+
+    def __getitem__(self, item: str) -> dict:
+        """Return the entry ``item``, loading ``<item>.json`` lazily if needed.
+
+        :raises KeyError: If the entry is neither in memory nor on disk.
+        """
+        if item not in self._cached_data:
+            filepath = self._cache_dir / f'{item}.json'
+            if filepath.is_file():
+                self._cached_data[item] = self._read_json(filepath)
+
+        return self._cached_data[item]
+
+    def __setitem__(self, key: str, value: dict):
+        """Store ``value`` in memory; it is written to disk in ``__exit__``."""
+        self._cached_data[key] = value
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Write all entries to disk unless the ``with`` body raised.
+
+        Nothing is returned, so an exception from the body keeps propagating.
+        """
+        if exc_type is None:
+            self._cache_dir.mkdir(exist_ok=True, parents=True)
+
+            for basename, data in self._cached_data.items():
+                cachefile = self._cache_dir / f'{basename}.json'
+                # Closing the handle guarantees the data is flushed to disk.
+                with cachefile.open('w', encoding='utf-8') as cache_file:
+                    json.dump(data, cache_file)
+
+
+class HermesContext:
+    """Entry point to the per-step caches below ``<project_dir>/.hermes``.
+
+    Steps are tracked on a stack: ``prepare_step`` pushes a step name,
+    ``finalize_step`` pops it, and indexing the context returns the cache of
+    one source within the step that is currently on top of the stack.
+    """
+
+    CACHE_DIR_NAME = '.hermes'
+
+    def __init__(self, project_dir: 'pathlib.Path | None' = None):
+        """Set up the cache location for a project.
+
+        :param project_dir: Project root; defaults to the working directory
+            at the time the context is created.
+        """
+        # Resolve the default per call: a ``Path.cwd()`` default argument
+        # would be evaluated only once, when the module is imported.
+        if project_dir is None:
+            project_dir = pathlib.Path.cwd()
+
+        self.project_dir = project_dir
+        self.cache_dir = project_dir / self.CACHE_DIR_NAME
+
+        self._current_step = []
+
+    def prepare_step(self, step: str, *depends: str) -> None:
+        """Mark ``step`` as the active step.
+
+        :param step: Name of the step to enter.
+        :param depends: Accepted but currently not evaluated.
+        """
+        self._current_step.append(step)
+
+    def finalize_step(self, step: str) -> None:
+        """Leave ``step``, which must be the step that is currently active.
+
+        The step stack is left untouched if the check fails.
+
+        :param step: Name of the step to leave.
+        :raises ValueError: If no step is active or a different step is active.
+        """
+        if not self._current_step:
+            raise ValueError(f"Cannot end step {step}: no step is active.")
+
+        # Validate before popping so the message names the step we are
+        # really in and a failed call does not corrupt the stack.
+        current_step = self._current_step[-1]
+        if current_step != step:
+            raise ValueError(f"Cannot end step {step} while in {current_step}.")
+
+        self._current_step.pop()
+
+    def __getitem__(self, source_name: str) -> HermesCache:
+        """Return the cache for ``source_name`` within the active step.
+
+        The cache lives in ``<cache_dir>/<active step>/<source_name>``.
+
+        :raises IndexError: If no step has been prepared.
+        """
+        subdir = self.cache_dir / self._current_step[-1] / source_name
+        return HermesCache(subdir)