Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ha Ha Hack-A-Thon! 🐱‍💻 #288

Draft
wants to merge 6 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 65 additions & 4 deletions docs/source/dev/data_model.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,69 @@ Instead, use {class}`hermes.model.context.HermesContext` and respective subclass
## Harvest Data

The data of the harvesters is cached in the sub-directory `.hermes/harvest`.
Each harvester has a separate cache file to allow parallel harvesting.
The cache file is encoded in JSON and stored in `.hermes/harvest/HARVESTER_NAME.json`
where `HARVESTER_NAME` corresponds to the entry point name.

{class}`hermes.model.context.HermesHarvestContext` encapsulates these harvester caches.

## Data representation

We are trying to be fully JSON-LD compliant. However, there are two special cases where we are less strict:

- Storing provenance of harvested data (for later curation)
- Storing alternatives of harvested data (for later curation)

Internally, `hermes` works with the expanded version of the JSON-LD file.
However, there are helper classes that allow you to use compact references.

For the storing of provenance and alternatives, we introduce our own terms.
In particular, we add a `hermes:meta` key to the top level record.
This top level key contains a list of additional meta-metadata (i.e., provenance and alternatives).

Each entry in the meta-metadata list is a dictionary that contains at least a `reference` value and one or more of
`provenance` and `alternative` keys.
The `reference` value should be a valid JSON Path that references the object that is subject to these metadata.
The `provenance` value should be a JSON dataset that keeps information about where the data came from.
The `alternative` value should be a list with alternative records.

Each alternative record contains a `value` and optionally an additional `provenance` key.

Example:

```json
{
"@context": [
"https://doi.org/10.5063/schema/codemeta-2.0",
{"hermes": "https://schema.software-metadata.pub/hermes/1.0"},
{"legalName": {"@id": "schema:name"}}
],
"@type": "SoftwareSourceCode",
"author": [
{
"@id": "https://orcid.org/0000-0001-6372-3853",
"@type": "Person",
"affiliation": {
"@type": "Organization",
"legalName": "German Aerospace Center (DLR)"
},
"familyName": "Meinel",
"givenName": "Michael",
"email": "[email protected]"
}
],
"description": "Tool to automate software publication. Not stable yet.",
"identifier": "https://doi.org/10.5281/zenodo.13221384",
"license": "https://spdx.org/licenses/Apache-2.0",
"name": "hermes",
"version": "0.8.1",
"hermes:meta": [
{
"reference": "$",
"provenance": { "harvester": "cff", "source": "CITATION.cff" }
},
{
"reference": "$.author[0].affiliation.legalName",
"alternative": [
{"value": "DLR e.V.", "provenance": { "harvester": "orcid" }}
]
}
]
}
```
2 changes: 1 addition & 1 deletion hermes.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: CC0-1.0

[harvest]
sources = [ "cff", "toml" ] # ordered priority (first one is most important)
sources = [ "cff", "toml", "git" ] # ordered priority (first one is most important)

[deposit]
target = "invenio_rdm"
Expand Down
34 changes: 23 additions & 11 deletions src/hermes/commands/harvest/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@

import argparse
import typing as t
from datetime import datetime

from pydantic import BaseModel

from hermes.commands.base import HermesCommand, HermesPlugin
from hermes.model.context import HermesContext, HermesHarvestContext
from hermes.model.context_manager import HermesContext
from hermes.model.errors import HermesValidationError, MergeError
from hermes.model.ld_utils import bundled_document_loader, jsonld_dict


class HermesHarvestPlugin(HermesPlugin):
Expand All @@ -39,28 +39,40 @@ class HermesHarvestCommand(HermesCommand):

def __call__(self, args: argparse.Namespace) -> None:
self.args = args
ctx = HermesContext()

# Initialize the harvest cache directory here to indicate the step ran
ctx.init_cache("harvest")
ctx = HermesContext()
ctx.prepare_step('harvest')

for plugin_name in self.settings.sources:
try:
# Load plugin and run the harvester
plugin_func = self.plugins[plugin_name]()
harvested_data, tags = plugin_func(self)

with HermesHarvestContext(ctx, plugin_name) as harvest_ctx:
harvest_ctx.update_from(harvested_data,
plugin=plugin_name,
timestamp=datetime.now().isoformat(), **tags)
for _key, ((_value, _tag), *_trace) in harvest_ctx._data.items():
if any(v != _value and t == _tag for v, t in _trace):
raise MergeError(_key, None, _value)
# Ensure we have a jsonld_dict here to allow expansion
if not isinstance(harvested_data, jsonld_dict):
harvested_data = jsonld_dict(**harvested_data)

# Transform the graph into a canonical form
expanded_data, jsonld_context = harvested_data.expand()

with ctx[plugin_name] as plugin_ctx:
plugin_ctx['data'] = harvested_data
plugin_ctx['jsonld'] = expanded_data
plugin_ctx['context'] = jsonld_context
plugin_ctx['tags'] = tags

except KeyError as e:
self.log.error("Plugin '%s' not found.", plugin_name)
self.errors.append(e)

#except HermesHarvestError as e:
# self.log.error("Harvesting %s failed: %s", plugin_name, e)
# self.errors.append(e)

except HermesValidationError as e:
self.log.error("Error while executing %s: %s", plugin_name, e)
self.errors.append(e)

ctx.finalize_step('harvest')
8 changes: 4 additions & 4 deletions src/hermes/commands/harvest/cff.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@
import jsonschema
from cffconvert import Citation

from hermes.commands.harvest.base import HermesHarvestPlugin, HermesHarvestCommand
from hermes.model.context import ContextPath
from hermes.model.errors import HermesValidationError
from hermes.commands.harvest.base import HermesHarvestPlugin, HermesHarvestCommand

from hermes.model.ld_utils import jsonld_dict

# TODO: should this be configurable via a CLI option?
_CFF_VERSION = '1.2.0'
Expand Down Expand Up @@ -53,8 +53,8 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
# Convert to CodeMeta using cffconvert
codemeta_dict = self._convert_cff_to_codemeta(cff_data)
# TODO Replace the following temp patch for #112 once there is a new cffconvert version with cffconvert#309
codemeta_dict = self._patch_author_emails(cff_dict, codemeta_dict)

codemeta_dict = jsonld_dict(self._patch_author_emails(cff_dict, codemeta_dict))
codemeta_dict.add_context({'legalName': {'@id': "schema:name"}})
return codemeta_dict, {'local_path': str(cff_file)}

def _load_cff_from_file(self, cff_data: str) -> t.Any:
Expand Down
58 changes: 58 additions & 0 deletions src/hermes/model/context_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import json
import os.path
import pathlib


class HermesCache:
    """Directory-backed cache of JSON documents, keyed by file base name.

    Each entry ``key`` is persisted as ``<cache_dir>/<key>.json``.  Used as a
    context manager, the cache pre-loads every file found in the directory on
    entry and writes all in-memory entries back on a clean exit.
    """

    def __init__(self, cache_dir: pathlib.Path):
        """Create a cache bound to *cache_dir*; nothing is read or written yet."""
        self._cache_dir = cache_dir
        self._cached_data = {}

    @staticmethod
    def _load(filepath: pathlib.Path) -> dict:
        """Read one JSON file and make sure the handle is closed afterwards."""
        with filepath.open('r', encoding='utf-8') as cache_file:
            return json.load(cache_file)

    def __enter__(self):
        """Load all files of the cache directory (if it exists) into memory."""
        if self._cache_dir.is_dir():
            for filepath in self._cache_dir.glob('*'):
                # Sub-directories cannot be opened as JSON documents; skip them
                # instead of failing the whole load.
                if not filepath.is_file():
                    continue
                basename, _ = os.path.splitext(filepath.name)
                self._cached_data[basename] = self._load(filepath)

        return self

    def __getitem__(self, item: str) -> dict:
        """Return the entry *item*, loading ``<item>.json`` lazily if needed.

        :raises KeyError: If the entry is neither in memory nor on disk.
        """
        if item not in self._cached_data:
            filepath = self._cache_dir / f'{item}.json'
            if filepath.is_file():
                self._cached_data[item] = self._load(filepath)

        return self._cached_data[item]

    def __setitem__(self, key: str, value: dict):
        """Store *value* in memory; it is persisted when the context exits."""
        self._cached_data[key] = value

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Write all entries to disk, but only if the block raised no exception."""
        if exc_type is None:
            self._cache_dir.mkdir(exist_ok=True, parents=True)

            for basename, data in self._cached_data.items():
                cachefile = self._cache_dir / f'{basename}.json'
                # The context manager guarantees the data is flushed and the
                # handle released before the next file is written.
                with cachefile.open('w', encoding='utf-8') as cache_file:
                    json.dump(data, cache_file)


class HermesContext:
    """Entry point to the per-project cache located in ``<project_dir>/.hermes``.

    Keeps a stack of the workflow steps currently being executed.  Indexing
    the context with a source name yields the cache of that source for the
    innermost active step.
    """

    CACHE_DIR_NAME = '.hermes'

    def __init__(self, project_dir: 'pathlib.Path | None' = None):
        """Bind the context to *project_dir*.

        :param project_dir: Root directory of the project.  Defaults to the
            working directory at the time of the call.
        """
        # Resolve the default here, not in the signature: a default argument is
        # evaluated once at import time and would pin the working directory
        # that was active when the module was first loaded.
        if project_dir is None:
            project_dir = pathlib.Path.cwd()

        self.project_dir = project_dir
        self.cache_dir = project_dir / self.CACHE_DIR_NAME

        self._current_step = []

    def prepare_step(self, step: str, *depends: str) -> None:
        """Enter *step*; ``depends`` is accepted but currently not evaluated."""
        self._current_step.append(step)

    def finalize_step(self, step: str) -> None:
        """Leave the innermost step, which has to be *step*.

        :raises ValueError: If the innermost active step is not *step*.
        """
        current_step = self._current_step.pop()
        if current_step != step:
            # Report the step that was actually popped; the stack may already
            # be empty at this point.
            raise ValueError(f"Cannot end step {step} while in {current_step}.")

    def __getitem__(self, source_name: str) -> 'HermesCache':
        """Return the cache of *source_name* within the innermost active step."""
        subdir = self.cache_dir / self._current_step[-1] / source_name
        return HermesCache(subdir)
Loading
Loading