diff --git a/.gitignore b/.gitignore index a009431a..d7c36ebe 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ docs/build/ .idea/ .venv/ +.vscode/ dist/ # HERMES workflow specifics diff --git a/docs/source/conf.py b/docs/source/conf.py index 84f5517e..88a923a9 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -28,7 +28,9 @@ project = 'HERMES Workflow' copyright = '2022, HERMES project' -author = 'Oliver Bertuch, Stephan Druskat, Guido Juckeland, Jeffrey Kelling, Oliver Knodel, Michael Meinel, Tobias Schlauch' +author = 'Oliver Bertuch, Stephan Druskat, Guido Juckeland, Jeffrey Kelling, ' + \ + 'Oliver Knodel, Michael Meinel, Tobias Schlauch, Sophie Kernchen' + # The full version, including alpha/beta/rc tags release = '2022-07-01' diff --git a/hermes.toml b/hermes.toml index f6831be5..999179a2 100644 --- a/hermes.toml +++ b/hermes.toml @@ -13,7 +13,7 @@ target = "invenio_rdm" [deposit.invenio_rdm] site_url = "https://sandbox.zenodo.org" -communities = ["zenodo"] +communities = [] access_right = "open" [deposit.invenio_rdm.api_paths] diff --git a/pyproject.toml b/pyproject.toml index cc8f7b67..7c26c22a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,45 +86,10 @@ git = "hermes.commands.process.git:process" git_add_contributors = "hermes.commands.process.git:add_contributors" git_add_branch = "hermes.commands.process.git:add_branch" -[tool.poetry.plugins."hermes.deposit.prepare"] -invenio = "hermes.commands.deposit.invenio:prepare" -invenio_rdm = "hermes.commands.deposit.invenio_rdm:prepare" -file = "hermes.commands.deposit.file:dummy_noop" - -[tool.poetry.plugins."hermes.deposit.map"] -invenio = "hermes.commands.deposit.invenio:map_metadata" -invenio_rdm = "hermes.commands.deposit.invenio_rdm:map_metadata" -file = "hermes.commands.deposit.file:map_metadata" - -[tool.poetry.plugins."hermes.deposit.create_initial_version"] -invenio = "hermes.commands.deposit.invenio:create_initial_version" -invenio_rdm = 
"hermes.commands.deposit.invenio_rdm:create_initial_version" -file = "hermes.commands.deposit.file:dummy_noop" - -[tool.poetry.plugins."hermes.deposit.create_new_version"] -invenio = "hermes.commands.deposit.invenio:create_new_version" -invenio_rdm = "hermes.commands.deposit.invenio_rdm:create_new_version" -file = "hermes.commands.deposit.file:dummy_noop" - -[tool.poetry.plugins."hermes.deposit.update_metadata"] -invenio = "hermes.commands.deposit.invenio:update_metadata" -invenio_rdm = "hermes.commands.deposit.invenio_rdm:update_metadata" -file = "hermes.commands.deposit.file:dummy_noop" - -[tool.poetry.plugins."hermes.deposit.delete_artifacts"] -invenio = "hermes.commands.deposit.invenio:delete_artifacts" -invenio_rdm = "hermes.commands.deposit.invenio_rdm:delete_artifacts" -file = "hermes.commands.deposit.file:dummy_noop" - -[tool.poetry.plugins."hermes.deposit.upload_artifacts"] -invenio = "hermes.commands.deposit.invenio:upload_artifacts" -invenio_rdm = "hermes.commands.deposit.invenio_rdm:upload_artifacts" -file = "hermes.commands.deposit.file:dummy_noop" - -[tool.poetry.plugins."hermes.deposit.publish"] -invenio = "hermes.commands.deposit.invenio:publish" -invenio_rdm = "hermes.commands.deposit.invenio_rdm:publish" -file = "hermes.commands.deposit.file:publish" +[tool.poetry.plugins."hermes.deposit"] +file = "hermes.commands.deposit.file:FileDepositPlugin" +invenio = "hermes.commands.deposit.invenio:InvenioDepositPlugin" +invenio_rdm = "hermes.commands.deposit.invenio_rdm:IvenioRDMDepositPlugin" [tool.poetry.plugins."hermes.postprocess"] config_invenio_record_id = "hermes.commands.postprocess.invenio:config_record_id" diff --git a/src/hermes/commands/deposit/base.py b/src/hermes/commands/deposit/base.py new file mode 100644 index 00000000..81d41f52 --- /dev/null +++ b/src/hermes/commands/deposit/base.py @@ -0,0 +1,89 @@ +# SPDX-FileCopyrightText: 2023 Helmholtz-Zentrum Dresden-Rossendorf (HZDR) +# +# SPDX-License-Identifier: Apache-2.0 + +# 
SPDX-FileContributor: David Pape +# SPDX-FileContributor: Michael Meinel + +import abc + +import click + +from hermes.model.context import CodeMetaContext + + +class BaseDepositPlugin(abc.ABC): + def __init__(self, click_ctx: click.Context, ctx: CodeMetaContext) -> None: + self.click_ctx = click_ctx + self.ctx = ctx + + def __call__(self) -> None: + """Initiate the deposition process. + + This calls a list of additional methods on the class, none of which need to be implemented. + """ + self.prepare() + self.map_metadata() + + if self.is_initial_publication(): + self.create_initial_version() + else: + self.create_new_version() + + self.update_metadata() + self.delete_artifacts() + self.upload_artifacts() + self.publish() + + def prepare(self) -> None: + """Prepare the deposition. + + This method may be implemented to check whether config and context match some initial conditions. + + If no exceptions are raised, execution continues. + """ + pass + + @abc.abstractmethod + def map_metadata(self) -> None: + """Map the given metadata to the target schema of the deposition platform.""" + pass + + def is_initial_publication(self) -> bool: + """Decide whether to do an initial publication or publish a new version. + + Returning ``True`` indicates that publication of an initial version will be executed, resulting in a call of + :meth:`create_initial_version`. ``False`` indicates a new version of an existing publication, leading to a call + of :meth:`create_new_version`. + + By default, this returns ``True``. 
+ """ + return True + + @abc.abstractmethod + def create_initial_version(self) -> None: + """Create an initial version of the publication on the target platform.""" + pass + + @abc.abstractmethod + def create_new_version(self) -> None: + """Create a new version of an existing publication on the target platform.""" + pass + + @abc.abstractmethod + def update_metadata(self) -> None: + """Update the metadata of the newly created version.""" + pass + + def delete_artifacts(self) -> None: + """Delete any superfluous artifacts taken from the previous version of the publication.""" + pass + + def upload_artifacts(self) -> None: + """Upload new artifacts to the target platform.""" + pass + + @abc.abstractmethod + def publish(self) -> None: + """Publish the newly created deposit on the target platform.""" + pass diff --git a/src/hermes/commands/deposit/file.py b/src/hermes/commands/deposit/file.py index 0878cbf4..ac26e7c9 100644 --- a/src/hermes/commands/deposit/file.py +++ b/src/hermes/commands/deposit/file.py @@ -7,24 +7,18 @@ import json -import click - from hermes import config -from hermes.model.context import CodeMetaContext +from hermes.commands.deposit.base import BaseDepositPlugin from hermes.model.path import ContextPath -def dummy_noop(click_ctx: click.Context, ctx: CodeMetaContext): - pass - - -def map_metadata(click_ctx: click.Context, ctx: CodeMetaContext): - ctx.update(ContextPath.parse('deposit.file'), ctx['codemeta']) - +class FileDepositPlugin(BaseDepositPlugin): + def map_metadata(self) -> None: + self.ctx.update(ContextPath.parse('deposit.file'), self.ctx['codemeta']) -def publish(click_ctx: click.Context, ctx: CodeMetaContext): - file_config = config.get("deposit").get("file", {}) - output_data = ctx['deposit.file'] + def publish(self) -> None: + file_config = config.get("deposit").get("file", {}) + output_data = self.ctx['deposit.file'] - with open(file_config.get('filename', 'hermes.json'), 'w') as deposition_file: - json.dump(output_data, 
deposition_file) + with open(file_config.get('filename', 'hermes.json'), 'w') as deposition_file: + json.dump(output_data, deposition_file, indent=2) diff --git a/src/hermes/commands/deposit/invenio.py b/src/hermes/commands/deposit/invenio.py index bbc4ab28..ecd6c337 100644 --- a/src/hermes/commands/deposit/invenio.py +++ b/src/hermes/commands/deposit/invenio.py @@ -17,648 +17,630 @@ import requests from hermes import config +from hermes.commands.deposit.base import BaseDepositPlugin from hermes.commands.deposit.error import DepositionUnauthorizedError from hermes.error import MisconfigurationError from hermes.model.context import CodeMetaContext from hermes.model.path import ContextPath from hermes.utils import hermes_user_agent -_DEFAULT_LICENSES_API_PATH = "api/licenses" -_DEFAULT_COMMUNITIES_API_PATH = "api/communities" -_DEFAULT_DEPOSITIONS_API_PATH = "api/deposit/depositions" +_log = logging.getLogger("cli.deposit.invenio") -def prepare(click_ctx: click.Context, ctx: CodeMetaContext): - """Prepare the deposition on an Invenio-based platform. 
- In this function we do the following: +class InvenioClient(requests.Session): + DEFAULT_LICENSES_API_PATH = "api/licenses" + DEFAULT_COMMUNITIES_API_PATH = "api/communities" + DEFAULT_DEPOSITIONS_API_PATH = "api/deposit/depositions" + DEFAULT_RECORDS_API_PATH = "api/records" - - resolve the latest published version of this publication (if any) - - check whether the current version (given in the CodeMeta) was already published - - check whether we have a valid license identifier (if any) - - check wether the communities are valid (if configured) - - check access modalities (access right, access conditions, embargo data, existence - of license) - - check whether required configuration options are present - - check whether an auth token is given - - update ``ctx`` with metadata collected during the checks - """ + # Used for context path and config + platform_name = "invenio" - if not click_ctx.params["auth_token"]: - raise DepositionUnauthorizedError("No auth token given for deposition platform") + def __init__(self, auth_token=None, platform_name=None) -> None: + super().__init__() - invenio_path = ContextPath.parse("deposit.invenio") - invenio_config = config.get("deposit").get("invenio", {}) - rec_id, rec_meta = _resolve_latest_invenio_id(ctx) + if platform_name is not None: + self.platform_name = platform_name - version = ctx["codemeta"].get("version") - if rec_meta and (version == rec_meta.get("version")): - raise ValueError(f"Version {version} already deposited.") + self.config = config.get("deposit").get(self.platform_name, {}) + self.headers.update({"User-Agent": hermes_user_agent}) - ctx.update(invenio_path['latestRecord'], {'id': rec_id, 'metadata': rec_meta}) + self.auth_token = auth_token + self.site_url = self.config.get("site_url") + if self.site_url is None: + raise MisconfigurationError(f"deposit.{self.platform_name}.site_url is not configured") - site_url = invenio_config.get("site_url") - if site_url is None: - raise 
MisconfigurationError("deposit.invenio.site_url is not configured") + def request(self, method, url, headers=None, **kwargs) -> requests.Response: + """Overridden request method to automatically set Authorization header for all requests to the configured site. - licenses_api_path = invenio_config.get("api_paths", {}).get( - "licenses", _DEFAULT_LICENSES_API_PATH - ) - licenses_api_url = f"{site_url}/{licenses_api_path}" - license = _get_license_identifier(ctx, licenses_api_url) - ctx.update(invenio_path["license"], license) + See [requests documentation](https://requests.readthedocs.io/en/latest/api.html#requests.request) for details. + """ - communities_api_path = invenio_config.get("api_paths", {}).get( - "communities", _DEFAULT_COMMUNITIES_API_PATH - ) - communities_api_url = f"{site_url}/{communities_api_path}" - communities = _get_community_identifiers(ctx, communities_api_url) - ctx.update(invenio_path["communities"], communities) + if self.auth_token: + if urlparse(self.site_url).hostname == urlparse(url).hostname: + headers = (headers or {}) | {"Authorization": f"Bearer {self.auth_token}"} + return super().request(method, url, headers=headers, **kwargs) - access_right, embargo_date, access_conditions = _get_access_modalities(license) - ctx.update(invenio_path["access_right"], access_right) - ctx.update(invenio_path["embargo_date"], embargo_date) - ctx.update(invenio_path["access_conditions"], access_conditions) + def get_record(self, record_id): + return self.get(f"{self.site_url}/{self.records_api_path}/{record_id}") + def get_deposit(self, latest_record_id): + return self.get( + f"{self.site_url}/{self.depositions_api_path}/{latest_record_id}" + ) -def map_metadata(click_ctx: click.Context, ctx: CodeMetaContext): - """Map the harvested metadata onto the Invenio schema.""" + def get_license(self, license_id): + return self.get(f"{self.site_url}/{self.licenses_api_path}/{license_id}") - deposition_metadata = _codemeta_to_invenio_deposition(ctx) + def 
get_community(self, community_id): + return self.get(f"{self.site_url}/{self.communities_api_path}/{community_id}") - metadata_path = ContextPath.parse("deposit.invenio.depositionMetadata") - ctx.update(metadata_path, deposition_metadata) + def new_deposit(self): + return self.post(f"{self.site_url}/{self.depositions_api_path}", json={}) - # Store a snapshot of the mapped data within the cache, useful for analysis, debugging, etc - with open(ctx.get_cache("deposit", "invenio", create=True), 'w') as invenio_json: - json.dump(deposition_metadata, invenio_json, indent=' ') + @property + def api_paths(self): + return self.config.get("api_paths", {}) + @property + def licenses_api_path(self): + return self.api_paths.get("licenses", self.DEFAULT_LICENSES_API_PATH) -def create_initial_version(click_ctx: click.Context, ctx: CodeMetaContext): - """Create an initial version of a publication. + @property + def communities_api_path(self): + return self.api_paths.get("communities", self.DEFAULT_COMMUNITIES_API_PATH) - If a previous publication exists, this function does nothing, leaving the work for - :func:`create_new_version`. - """ + @property + def depositions_api_path(self): + return self.api_paths.get("depositions", self.DEFAULT_DEPOSITIONS_API_PATH) - invenio_path = ContextPath.parse("deposit.invenio") - invenio_ctx = ctx[invenio_path] - latest_record_id = invenio_ctx.get("latestRecord", {}).get("id") + @property + def records_api_path(self): + return self.api_paths.get("records", self.DEFAULT_RECORDS_API_PATH) - if latest_record_id is not None: - # A previous version exists. This means that we need to create a new version in - # the next step. Thus, there is nothing to do here. 
- return - if not click_ctx.params['initial']: - raise RuntimeError("Please use `--initial` to make an initial deposition.") +class InvenioResolver: - _log = logging.getLogger("cli.deposit.invenio") + invenio_client_class = InvenioClient - invenio_config = config.get("deposit").get("invenio", {}) - site_url = invenio_config["site_url"] - depositions_api_path = invenio_config.get("api_paths", {}).get( - "depositions", _DEFAULT_DEPOSITIONS_API_PATH - ) + def __init__(self, client=None): + self.client = client or self.invenio_client_class() - deposition_metadata = invenio_ctx["depositionMetadata"] + def resolve_latest_id( + self, record_id=None, doi=None, codemeta_identifier=None + ) -> t.Tuple[t.Optional[str], dict]: + """ + Using the given metadata parameters, figure out the latest record id. - deposit_url = f"{site_url}/{depositions_api_path}" - response = requests.post( - deposit_url, - json={"metadata": deposition_metadata}, - headers={ - "User-Agent": hermes_user_agent, - "Authorization": f"Bearer {click_ctx.params['auth_token']}", - } - ) + If ``record_id`` is given, it will be used to identify the latest version of the + record. Otherwise, if there is a DOI present (either as ``doi`` parameter or as + ``codemeta_identifier``), the DOI will be used to resolve the base record id. - if not response.ok: - raise RuntimeError(f"Could not create initial deposit {deposit_url!r}") + Either way the record id will be used to resolve the latest version. - deposit = response.json() - _log.debug("Created initial version deposit: %s", deposit["links"]["html"]) - with open(ctx.get_cache('deposit', 'deposit', create=True), 'w') as deposit_file: - json.dump(deposit, deposit_file, indent=4) + If any of the resolution steps fail or produce an unexpected result, a + ``ValueError`` will be raised. 
+ """ - ctx.update(invenio_path["links"]["bucket"], deposit["links"]["bucket"]) - ctx.update(invenio_path["links"]["publish"], deposit["links"]["publish"]) + # Check if we configured an Invenio record ID (of the concept...) + if record_id is None: + if doi is None: + if codemeta_identifier is not None: + # TODO: There might be more semantic in the codemeta.identifier... (also see schema.org) + if codemeta_identifier.startswith('https://doi.org/'): + doi = codemeta_identifier[16:] + elif codemeta_identifier.startswith('http://dx.doi.org/'): + doi = codemeta_identifier[18:] -def create_new_version(click_ctx: click.Context, ctx: CodeMetaContext): - """Create a new version of an existing publication. + if doi is not None: + # If we got a DOI, resolve it (using doi.org) into a Invenio URL ... and extract the record id. + record_id = self.resolve_doi(doi) - If no previous publication exists, this function does nothing because - :func:`create_initial_version` will have done the work. - """ + if record_id is not None: + # If we got a record id by now, resolve it using the Invenio API to the latests record. + return self.resolve_record_id(record_id) - invenio_path = ContextPath.parse("deposit.invenio") - invenio_ctx = ctx[invenio_path] - latest_record_id = invenio_ctx.get("latestRecord", {}).get("id") + return None, {} - if latest_record_id is None: - # No previous version exists. This means that an initial version was created in - # the previous step. Thus, there is nothing to do here. - return + def resolve_doi(self, doi) -> str: + """ + Resolve a DOI to an Invenio URL and extract the record id. - session = requests.Session() - session.headers = { - "User-Agent": hermes_user_agent, - "Authorization": f"Bearer {click_ctx.params['auth_token']}", - } + :param doi: The DOI to be resolved (only the identifier *without* the ``https://doi.org/`` prefix). + :return: The record ID on the respective instance. 
+ """ - invenio_config = config.get("deposit").get("invenio", {}) - site_url = invenio_config["site_url"] - depositions_api_path = invenio_config.get("api_paths", {}).get( - "depositions", _DEFAULT_DEPOSITIONS_API_PATH - ) + res = self.client.get(f'https://doi.org/{doi}') - # Get current deposit - deposit_url = f"{site_url}/{depositions_api_path}/{latest_record_id}" - response = session.get(deposit_url) - if not response.ok: - raise RuntimeError(f"Failed to get current deposit {deposit_url!r}") + # This is a mean hack due to DataCite answering a 404 with a 200 status + if res.url == 'https://datacite.org/404.html': + raise ValueError(f"Invalid DOI: {doi}") - # Create a new version using the newversion action - deposit_url = response.json()["links"]["newversion"] - response = session.post(deposit_url) - if not response.ok: - raise RuntimeError(f"Could not create new version deposit {deposit_url!r}") + # Ensure the resolved record is on the correct instance + if not res.url.startswith(self.client.site_url): + raise ValueError(f"{res.url} is not on configured host {self.client.site_url}.") - # Store link to latest draft to be used in :func:`update_metadata`. - old_deposit = response.json() - ctx.update(invenio_path["links"]["latestDraft"], old_deposit['links']['latest_draft']) + # Extract the record id as last part of the URL path + page_url = urlparse(res.url) + *_, record_id = page_url.path.split('/') + return record_id + def resolve_record_id(self, record_id: str) -> t.Tuple[str, dict]: + """ + Find the latest version of a given record. -def update_metadata(click_ctx: click.Context, ctx: CodeMetaContext): - """Update the metadata of a draft. + :param record_id: The record that sould be resolved. + :return: The record id of the latest version for the requested record. 
+ """ + res = self.client.get_record(record_id) + if res.status_code != 200: + raise ValueError(f"Could not retrieve record from {res.url}: {res.text}") - If no draft is found in the context, it is assumed that no metadata has to be - updated (e.g. because an initial version was created already containing the - metadata). - """ - - invenio_path = ContextPath.parse("deposit.invenio") - invenio_ctx = ctx[invenio_path] - draft_url = invenio_ctx.get("links", {}).get("latestDraft") - - if draft_url is None: - return + res_json = res.json() + res = self.client.get(res_json['links']['latest']) + if res.status_code != 200: + raise ValueError(f"Could not retrieve record from {res.url}: {res.text}") - _log = logging.getLogger("cli.deposit.invenio") - - deposition_metadata = invenio_ctx["depositionMetadata"] + res_json = res.json() + return res_json['id'], res_json['metadata'] - response = requests.put( - draft_url, - json={"metadata": deposition_metadata}, - headers={ - "User-Agent": hermes_user_agent, - "Authorization": f"Bearer {click_ctx.params['auth_token']}", - } - ) - - if not response.ok: - raise RuntimeError(f"Could not update metadata of draft {draft_url!r}") + def resolve_license_id(self, license_url: t.Optional[str]) -> t.Optional[str]: + """Get Invenio license representation from CodeMeta. - deposit = response.json() - _log.debug("Created new version deposit: %s", deposit["links"]["html"]) - with open(ctx.get_cache('deposit', 'deposit', create=True), 'w') as deposit_file: - json.dump(deposit, deposit_file, indent=4) - - ctx.update(invenio_path["links"]["bucket"], deposit["links"]["bucket"]) - ctx.update(invenio_path["links"]["publish"], deposit["links"]["publish"]) + The license to use is extracted from the ``license`` field in the + :class:`CodeMetaContext` and converted into an appropiate license identifier to be + passed to an Invenio instance. + A license according to CodeMeta may be a URL (text) or a CreativeWork. This function + only handles URLs. 
If a ``license`` field is present in the CodeMeta and it is not + of type :class:`str`, a :class:`RuntimeError` is raised. -def delete_artifacts(click_ctx: click.Context, ctx: CodeMetaContext): - """Delete existing file artifacts. + Invenio instances take a license string which refers to a license identifier. + Typically, Invenio instances offer licenses from https://opendefinition.org and + https://spdx.org. However, it is possible to mint PIDs for custom licenses. - This is done so that files which existed in an earlier publication but don't exist - any more, are removed. Otherwise they would cause an error because the didn't change - between versions. - """ - # TODO: This needs to be implemented! - pass + An API endpoint (usually ``/api/licenses``) can be used to check whether a given + license is supported by the Invenio instance. This function tries to retrieve the + license by the identifier at the end of the license URL path. If this identifier + does not exist on the Invenio instance, a :class:`RuntimeError` is raised. If no + license is given in the CodeMeta, ``None`` is returned. + """ + + if license_url is None: + return None + + if not isinstance(license_url, str): + raise RuntimeError( + "The given license in CodeMeta must be of type str. " + "Licenses of type 'CreativeWork' are not supported." + ) + parsed_url = urlparse(license_url) + url_path = parsed_url.path.rstrip("/") + license_id = url_path.split("/")[-1] -def upload_artifacts(click_ctx: click.Context, ctx: CodeMetaContext): - """Upload file artifacts to the deposit. + response = self.client.get_license(license_id) + if response.status_code == 404: + raise RuntimeError(f"Not a valid license identifier: {license_id}") + # Catch other problems + response.raise_for_status() - We'll use the bucket API rather than the files API as it supports file sizes above - 100MB. The URL to the bucket of the deposit is taken from the context at - ``deposit.invenio.links.bucket``. 
- """ - - bucket_url_path = ContextPath.parse("deposit.invenio.links.bucket") - bucket_url = ctx[bucket_url_path] + return response.json()["id"] - session = requests.Session() - session.headers = { - "User-Agent": hermes_user_agent, - "Authorization": f"Bearer {click_ctx.params['auth_token']}", - } - - files: list[click.Path] = click_ctx.params["file"] - for path_arg in files: - path = Path(path_arg) - - # This should not happen, as Click shall not accept dirs as arguments already. Zero trust anyway. - if not path.is_file(): - raise ValueError("Any given argument to be included in the deposit must be a file.") - with open(path, "rb") as file_content: - response = session.put( - f"{bucket_url}/{path.name}", - data=file_content - ) - if not response.ok: - raise RuntimeError(f"Could not upload file {path.name!r} into bucket {bucket_url!r}") +class InvenioDepositPlugin(BaseDepositPlugin): - # This can potentially be used to verify the checksum - # file_resource = response.json() + platform_name = "invenio" + invenio_client_class = InvenioClient + invenio_resolver_class = InvenioResolver + def __init__(self, click_ctx: click.Context, ctx: CodeMetaContext, client=None, resolver=None) -> None: + super().__init__(click_ctx, ctx) -def publish(click_ctx: click.Context, ctx: CodeMetaContext): - """Publish the deposited record. + self.invenio_context_path = ContextPath.parse(f"deposit.{self.platform_name}") + self.invenio_ctx = None - This is done by doing a POST request to the publication URL stored in the context at - ``deposit.invenio.links.publish``. 
- """ + if client is None: + auth_token = self.click_ctx.params.get("auth_token") + if auth_token is None: + raise DepositionUnauthorizedError("No auth token given for deposition platform") + self.client = self.invenio_client_class(auth_token=auth_token, platform_name=self.platform_name) + else: + self.client = client - _log = logging.getLogger("cli.deposit.invenio") + self.resolver = resolver or self.invenio_resolver_class(self.client) + self.config = config.get("deposit").get(self.platform_name, {}) + self.links = {} - publish_url_path = ContextPath.parse("deposit.invenio.links.publish") - publish_url = ctx[publish_url_path] + # TODO: Populate some data structure here? Or move more of this into __init__? + def prepare(self) -> None: + """Prepare the deposition on an Invenio-based platform. - response = requests.post( - publish_url, - headers={ - "User-Agent": hermes_user_agent, - "Authorization": f"Bearer {click_ctx.params['auth_token']}" - } - ) + In this function we do the following: - if not response.ok: - _log.debug(response.text) - raise RuntimeError(f"Could not publish deposit via {publish_url!r}") + - resolve the latest published version of this publication (if any) + - check whether the current version (given in the CodeMeta) was already published + - check whether we have a valid license identifier (if any) + - check wether the communities are valid (if configured) + - check access modalities (access right, access conditions, embargo data, existence + of license) + - check whether required configuration options are present + - update ``self.ctx`` with metadata collected during the checks + """ - record = response.json() - _log.info("Published record: %s", record["links"]["record_html"]) + rec_id = self.config.get('record_id') + doi = self.config.get('doi') + try: + codemeta_identifier = self.ctx["codemeta.identifier"] + except KeyError: + codemeta_identifier = None -def _resolve_latest_invenio_id(ctx: CodeMetaContext) -> t.Tuple[str, dict]: - """ - Using 
the given configuration and metadata, figure out the latest record id. + rec_id, rec_meta = self.resolver.resolve_latest_id( + record_id=rec_id, doi=doi, codemeta_identifier=codemeta_identifier + ) - If a record id is present as configuration ``deposit.invenio.record_id`` this one will be used to identify the - latest version of the record. Otherwise, if there is a doi present (either as configuration with key - ``deposit.invenio.doi`` or as a codemeta identifier, the DOI will be used to resolve the base record id. + version = self.ctx["codemeta"].get("version") + if rec_meta and (version == rec_meta.get("version")): + raise ValueError(f"Version {version} already deposited.") - Anyway, the record id will always be used to resolve the latest version. + self.ctx.update(self.invenio_context_path['latestRecord'], {'id': rec_id, 'metadata': rec_meta}) - If any of the resolution steps fail or produce an unexpected result, a ValueError will be thrown. + license = self._get_license_identifier() + self.ctx.update(self.invenio_context_path["license"], license) - :param ctx: The context for which the record id should be resolved. - :return: The Invenio record id and the metadata of the record - """ + communities = self._get_community_identifiers() + self.ctx.update(self.invenio_context_path["communities"], communities) - invenio_config = config.get('deposit').get('invenio', {}) - site_url = invenio_config.get('site_url') - if site_url is None: - raise MisconfigurationError("deposit.invenio.site_url is not configured") + access_right, embargo_date, access_conditions = self._get_access_modalities(license) + self.ctx.update(self.invenio_context_path["access_right"], access_right) + self.ctx.update(self.invenio_context_path["embargo_date"], embargo_date) + self.ctx.update(self.invenio_context_path["access_conditions"], access_conditions) - # Check if we configured an Invenio record ID (of the concept...) 
- record_id = invenio_config.get('record_id') - if record_id is None: - doi = invenio_config.get('doi') - if doi is None: - try: - # TODO: There might be more semantic in the codemeta.identifier... (also see schema.org) - identifier = ctx['codemeta.identifier'] - if identifier.startswith('https://doi.org/'): - doi = identifier[16:] - elif identifier.startswith('http://dx.doi.org/'): - doi = identifier[18:] - except KeyError: - pass - - if doi is not None: - # If we got a DOI, resolve it (using doi.org) into a Invenio URL ... and extract the record id. - record_id = _invenio_resolve_doi(site_url, doi) - - if record_id is not None: - # If we got a record id by now, resolve it using the Invenio API to the latests record. - return _invenio_resolve_record_id(site_url, record_id) - - return None, {} - - -def _invenio_resolve_doi(site_url, doi) -> str: - """ - Resolve an DOI to a Invenio URL and extract the record id. - - :param site_url: Root URL for the Invenio instance to use. - :param doi: The DOI to be resolved (only the identifier *without* the ``https://doi.org/`` prefix). - :return: The record ID on the respective instance. - """ - - res = requests.get(f'https://doi.org/{doi}') - - # This is a mean hack due to DataCite answering a 404 with a 200 status - if res.url == 'https://datacite.org/404.html': - raise ValueError(f"Invalid DOI: {doi}") - - # Ensure the resolved record is on the correct instance - if not res.url.startswith(site_url): - raise ValueError(f"{res.url} is not on configured host {site_url}.") - - # Extract the record id as last part of the URL path - page_url = urlparse(res.url) - *_, record_id = page_url.path.split('/') - return record_id - - -def _invenio_resolve_record_id(site_url: str, record_id: str) -> t.Tuple[str, dict]: - """ - Find the latest version of a given record. - - :param site_url: Root URL for the Invenio instance to use. - :param record_id: The record that sould be resolved. 
- :return: The record id of the latest version for the requested record. - """ - res = requests.get(f"{site_url}/api/records/{record_id}") - if res.status_code != 200: - raise ValueError(f"Could not retrieve record from {res.url}: {res.text}") - - res_json = res.json() - res = requests.get(res_json['links']['latest']) - if res.status_code != 200: - raise ValueError(f"Could not retrieve record from {res.url}: {res.text}") - - res_json = res.json() - return res_json['id'], res_json['metadata'] - - -def _codemeta_to_invenio_deposition(ctx: CodeMetaContext) -> dict: - """The mapping logic. - - Functionality similar to this exists in the ``convert_codemeta`` package which uses - the crosswalk tables to do the mapping: - - .. code-block:: python - - invenio_metadata = convert_codemeta.crosswalk( - metadata, "codemeta", "Zenodo" - ) - - Unfortunately, this doesn't work well with additional metadata in the same dict, so - it is safer to provide our own implementation. - - Currently, this function handles a lot of cases which we want to be able to - configure. A simple mapping from one JSON path to another is not enough. - - The metadata expected by Zenodo is described in the `Zenodo Developers guide - `_. Unfortunately, there doesn't seem - to be a schema one can download in order to validate these metadata. There might be - differences between Invenio-based platforms. - """ - - metadata = ctx["codemeta"] - license = ctx["deposit"]["invenio"]["license"] - communities = ctx["deposit"]["invenio"]["communities"] - access_right = ctx["deposit"]["invenio"]["access_right"] - embargo_date = ctx["deposit"]["invenio"]["embargo_date"] - access_conditions = ctx["deposit"]["invenio"]["access_conditions"] - - creators = [ - # TODO: Distinguish between @type "Person" and others - { - k: v for k, v in { - # TODO: This is ugly - "affiliation": author.get("affiliation", {"legalName": None}).get("legalName"), - # Invenio wants "family, given". author.get("name") might not have this format. 
- "name": f"{author.get('familyName')}, {author.get('givenName')}" - if author.get("familyName") and author.get("givenName") - else author.get("name"), - # Invenio expects the ORCID without the URL part - "orcid": author.get("@id", "").replace("https://orcid.org/", "") or None, - }.items() if v is not None - } - for author in metadata["author"] - ] - - # This is not used at the moment. See comment below in `deposition_metadata` dict. - contributors = [ # noqa: F841 - # TODO: Distinguish between @type "Person" and others - { - k: v for k, v in { - # TODO: This is ugly - "affiliation": contributor.get("affiliation", {"legalName": None}).get("legalName"), - # Invenio wants "family, given". contributor.get("name") might not have this format. - "name": f"{contributor.get('familyName')}, {contributor.get('givenName')}" - if contributor.get("familyName") and contributor.get("givenName") - else contributor.get("name"), - # Invenio expects the ORCID without the URL part - "orcid": contributor.get("@id", "").replace("https://orcid.org/", "") or None, - # TODO: Many possibilities here. Get from config - "type": "ProjectMember", - }.items() if v is not None - } - # TODO: Filtering out "GitHub" should be done elsewhere - for contributor in metadata["contributor"] if contributor.get("name") != "GitHub" - ] - - # TODO: Use the fields currently set to `None`. - # Some more fields are available but they most likely don't relate to software - # publications targeted by hermes. - deposition_metadata = {k: v for k, v in { - # If upload_type is "publication"/"image", a publication_type/image_type must be - # specified. Since hermes targets software publications, this can be ignored and - # upload_type can be hard-coded to "software". - # TODO: Make this a constant maybe. - "upload_type": "software", - # IS0 8601-formatted date - # TODO: Maybe we want a different date? Then make this configurable. If not, - # this can be removed as it defaults to today. 
- "publication_date": date.today().isoformat(), - "title": metadata["name"], - "creators": creators, - # TODO: Use a real description here. Possible sources could be - # `tool.poetry.description` from pyproject.toml or `abstract` from - # CITATION.cff. This should then be stored in codemeta description field. - "description": metadata["name"], - "access_right": access_right, - "license": license, - "embargo_date": embargo_date, - "access_conditions": access_conditions, - # TODO: If a publisher already has assigned a DOI to the files we want to - # upload, it should be used here. In this case, Invenio will not give us a new - # one. Set "prereserve_doi" accordingly. - "doi": None, - # This prereserves a DOI that can then be added to the files before publishing - # them. - # TODO: Use the DOI we get back from this. - "prereserve_doi": True, - # TODO: A good source for this could be `tool.poetry.keywords` in pyproject.toml. - "keywords": None, - "notes": None, - "related_identifiers": None, - # TODO: Use `contributors`. In the case of the hermes workflow itself, the - # contributors are currently all in `creators` already. So for now, we set this - # to `None`. Change this when relationship between authors and contributors can - # be specified in the processing step. - "contributors": None, - "references": None, - "communities": communities, - "grants": None, - "subjects": None, - "version": metadata.get('version'), - }.items() if v is not None} - - return deposition_metadata - - -def _get_license_identifier(ctx: CodeMetaContext, license_api_url: str): - """Get Invenio license representation from CodeMeta. - - The license to use is extracted from the ``license`` field in the - :class:`CodeMetaContext` and converted into an appropiate license identifier to be - passed to an Invenio instance. - - A license according to CodeMeta may be a URL (text) or a CreativeWork. This function - only handles URLs. 
If a ``license`` field is present in the CodeMeta and it is not - of type :class:`str`, a :class:`RuntimeError` is raised. - - Invenio instances take a license string which refers to a license identifier. - Typically, Invenio instances offer licenses from https://opendefinition.org and - https://spdx.org. However, it is possible to mint PIDs for custom licenses. - - An API endpoint (usually ``/api/licenses``) can be used to check whether a given - license is supported by the Invenio instance. This function tries to retrieve the - license by the identifier at the end of the license URL path. If this identifier - does not exist on the Invenio instance, a :class:`RuntimeError` is raised. If no - license is given in the CodeMeta, ``None`` is returned. - """ - - license_url = ctx["codemeta"].get("license") - - if license_url is None: - return None - - if not isinstance(license_url, str): - raise RuntimeError( - "The given license in CodeMeta must be of type str. " - "Licenses of type 'CreativeWork' are not supported." 
- ) + self.invenio_ctx = self.ctx[self.invenio_context_path] - parsed_url = urlparse(license_url) - url_path = parsed_url.path.rstrip("/") - license_id = url_path.split("/")[-1] + def map_metadata(self) -> None: + """Map the harvested metadata onto the Invenio schema.""" - response = requests.get( - f"{license_api_url}/{license_id}", headers={"User-Agent": hermes_user_agent} - ) - if response.status_code == 404: - raise RuntimeError(f"Not a valid license identifier: {license_id}") - # Catch other problems - response.raise_for_status() + deposition_metadata = self._codemeta_to_invenio_deposition() + self.ctx.update(self.invenio_context_path["depositionMetadata"], deposition_metadata) - return response.json()["id"] + # Store a snapshot of the mapped data within the cache, useful for analysis, debugging, etc + with open(self.ctx.get_cache("deposit", self.platform_name, create=True), 'w') as invenio_json: + json.dump(deposition_metadata, invenio_json, indent=' ') + def is_initial_publication(self) -> bool: + latest_record_id = self.invenio_ctx.get("latestRecord", {}).get("id") + return latest_record_id is None -def _get_community_identifiers(ctx: CodeMetaContext, communities_api_url: str): - """Get Invenio community identifiers from config. + def create_initial_version(self) -> None: + """Create an initial version of a publication.""" - This function gets the communities to be used for the deposition on an Invenio-based - site from the config and checks their validity against the site's API. If one of the - identifiers can not be found on the site, a :class:`MisconfigurationError` is - raised. 
- """ + if not self.click_ctx.params['initial']: + raise RuntimeError("Please use `--initial` to make an initial deposition.") - communities = config.get("deposit").get("invenio", {}).get("communities") - if communities is None: - return None + response = self.client.new_deposit() + if not response.ok: + _log.error("Server answered with error code %d:\n%s", response.status_code, response.text) + raise RuntimeError(f"Could not create initial deposit {response.url!r}") - session = requests.Session() - session.headers = {"User-Agent": hermes_user_agent} + deposit = response.json() + self.links.update(deposit["links"]) + _log.debug("Created initial version deposit: %s", self.links["html"]) - community_ids = [] - for community_id in communities: - url = f"{communities_api_url}/{community_id}" - response = session.get(url) - if response.status_code == 404: - raise MisconfigurationError( - f"Not a valid community identifier: {community_id}" - ) - # Catch other problems - response.raise_for_status() - community_ids.append({"identifier": response.json()["id"]}) + def create_new_version(self) -> None: + """Create a new version of an existing publication.""" - return community_ids + latest_record_id = self.invenio_ctx.get("latestRecord", {}).get("id") + # Get current deposit + response = self.client.get_deposit(latest_record_id) + if not response.ok: + _log.error("Server answered with error code %d:\n%s", response.status_code, response.text) + raise RuntimeError(f"Failed to get current deposit {response.url!r}") -def _get_access_modalities(license): - """Get access right, embargo date and access conditions based on configuration and given license. 
+ self.links.update(response.json()["links"]) - This function implements the rules laid out in the `Zenodo developer documentation - `_: + # Create a new version using the newversion action + deposit_url = self.links["newversion"] + response = self.client.post(deposit_url) + if not response.ok: + _log.error("Server answered with error code %d:\n%s", response.status_code, response.text) + raise RuntimeError(f"Could not create new version deposit {deposit_url!r}") - - ``access_right`` is a controlled vocabulary - - embargoed access depositions need an embargo date - - restricted access depositions need access conditions - - open and embargoed access depositions need a license - - closed access depositions have no further requirements + # Store link to latest draft to be used in :func:`update_metadata`. + old_deposit = response.json() + self.links.update(old_deposit["links"]) - This function also makes sure that the given embargo date can be parsed as an ISO - 8601 string representation and that the access rights are given as a string. 
- """ - invenio_config = config.get("deposit").get("invenio", {}) + def update_metadata(self) -> None: + """Update the metadata of a draft.""" - access_right = invenio_config.get("access_right") - if access_right is None: - raise MisconfigurationError("deposit.invenio.access_right is not configured") + draft_url = self.links["latest_draft"] - access_right_options = ["open", "embargoed", "restricted", "closed"] - if access_right not in access_right_options: - raise MisconfigurationError( - "deposition.invenio.access_right must be one of: " - f"{', '.join(access_right_options)}" - ) + deposition_metadata = self.invenio_ctx["depositionMetadata"] - embargo_date = invenio_config.get("embargo_date") - if access_right == "embargoed" and embargo_date is None: - raise MisconfigurationError( - f"With access_right {access_right}, " - "deposit.invenio.embargo_date must be configured" + response = self.client.put( + draft_url, + json={"metadata": deposition_metadata} ) - if embargo_date is not None: - try: - datetime.fromisoformat(embargo_date) - except ValueError: + if not response.ok: + _log.error("Server answered with error code %d:\n%s", response.status_code, response.text) + raise RuntimeError(f"Could not update metadata of draft {draft_url!r}") + + deposit = response.json() + self.links.update(deposit["links"]) + + _log.debug("Created new version deposit: %s", self.links["html"]) + with open(self.ctx.get_cache('deposit', 'deposit', create=True), 'w') as deposit_file: + json.dump(deposit, deposit_file, indent=4) + + def delete_artifacts(self) -> None: + """Delete existing file artifacts. + + This is done so that files which existed in an earlier publication but don't exist + any more, are removed. Otherwise they would cause an error because they didn't change + between versions. + """ + # TODO: This needs to be implemented! + pass + + def upload_artifacts(self) -> None: + """Upload file artifacts to the deposit.
+ + We'll use the bucket API rather than the files API as it supports file sizes above + 100MB. + """ + + bucket_url = self.links["bucket"] + + files: list[click.Path] = self.click_ctx.params["file"] + for path_arg in files: + path = Path(path_arg) + + # This should not happen, as Click shall not accept dirs as arguments already. Zero trust anyway. + if not path.is_file(): + raise ValueError("Any given argument to be included in the deposit must be a file.") + + with open(path, "rb") as file_content: + response = self.client.put( + f"{bucket_url}/{path.name}", data=file_content, + ) + if not response.ok: + _log.error("Server answered with error code %d:\n%s", response.status_code, response.text) + raise RuntimeError(f"Could not upload file {path.name!r} into bucket {bucket_url!r}") + + # This can potentially be used to verify the checksum + # file_resource = response.json() + + def publish(self) -> None: + """Publish the deposited record.""" + + publish_url = self.links["publish"] + response = self.client.post(publish_url) + if not response.ok: + _log.error("Server answered with error code %d:\n%s", response.status_code, response.text) + raise RuntimeError(f"Could not publish deposit via {publish_url!r}") + + record = response.json() + self.links.update(record["links"]) + + _log.info("Published record: %s", self.links["record_html"]) + + def _codemeta_to_invenio_deposition(self) -> dict: + """The mapping logic. + + Functionality similar to this exists in the ``convert_codemeta`` package which uses + the crosswalk tables to do the mapping: + + .. code-block:: python + + invenio_metadata = convert_codemeta.crosswalk( + metadata, "codemeta", "Zenodo" + ) + + Unfortunately, this doesn't work well with additional metadata in the same dict, so + it is safer to provide our own implementation. + + Currently, this function handles a lot of cases which we want to be able to + configure. A simple mapping from one JSON path to another is not enough. 
+ + The metadata expected by Zenodo is described in the `Zenodo Developers guide + `_. Unfortunately, there doesn't seem + to be a schema one can download in order to validate these metadata. There might be + differences between Invenio-based platforms. + """ + + metadata = self.ctx["codemeta"] + license = self.invenio_ctx["license"] + communities = self.invenio_ctx["communities"] + access_right = self.invenio_ctx["access_right"] + embargo_date = self.invenio_ctx["embargo_date"] + access_conditions = self.invenio_ctx["access_conditions"] + + creators = [ + # TODO: Distinguish between @type "Person" and others + { + k: v for k, v in { + # TODO: This is ugly + "affiliation": author.get("affiliation", {"legalName": None}).get("legalName"), + # Invenio wants "family, given". author.get("name") might not have this format. + "name": f"{author.get('familyName')}, {author.get('givenName')}" + if author.get("familyName") and author.get("givenName") + else author.get("name"), + # Invenio expects the ORCID without the URL part + "orcid": author.get("@id", "").replace("https://orcid.org/", "") or None, + }.items() if v is not None + } + for author in metadata["author"] + ] + + # This is not used at the moment. See comment below in `deposition_metadata` dict. + contributors = [ # noqa: F841 + # TODO: Distinguish between @type "Person" and others + { + k: v for k, v in { + # TODO: This is ugly + "affiliation": contributor.get("affiliation", {"legalName": None}).get("legalName"), + # Invenio wants "family, given". contributor.get("name") might not have this format. + "name": f"{contributor.get('familyName')}, {contributor.get('givenName')}" + if contributor.get("familyName") and contributor.get("givenName") + else contributor.get("name"), + # Invenio expects the ORCID without the URL part + "orcid": contributor.get("@id", "").replace("https://orcid.org/", "") or None, + # TODO: Many possibilities here. 
Get from config + "type": "ProjectMember", + }.items() if v is not None + } + # TODO: Filtering out "GitHub" should be done elsewhere + for contributor in metadata["contributor"] if contributor.get("name") != "GitHub" + ] + + # TODO: Use the fields currently set to `None`. + # Some more fields are available but they most likely don't relate to software + # publications targeted by hermes. + deposition_metadata = {k: v for k, v in { + # If upload_type is "publication"/"image", a publication_type/image_type must be + # specified. Since hermes targets software publications, this can be ignored and + # upload_type can be hard-coded to "software". + # TODO: Make this a constant maybe. + "upload_type": "software", + # ISO 8601-formatted date + # TODO: Maybe we want a different date? Then make this configurable. If not, + # this can be removed as it defaults to today. + "publication_date": date.today().isoformat(), + "title": metadata["name"], + "creators": creators, + # TODO: Use a real description here. Possible sources could be + # `tool.poetry.description` from pyproject.toml or `abstract` from + # CITATION.cff. This should then be stored in codemeta description field. + "description": metadata["name"], + "access_right": access_right, + "license": license, + "embargo_date": embargo_date, + "access_conditions": access_conditions, + # TODO: If a publisher already has assigned a DOI to the files we want to + # upload, it should be used here. In this case, Invenio will not give us a new + # one. Set "prereserve_doi" accordingly. + "doi": None, + # This prereserves a DOI that can then be added to the files before publishing + # them. + # TODO: Use the DOI we get back from this. + "prereserve_doi": True, + # TODO: A good source for this could be `tool.poetry.keywords` in pyproject.toml. + "keywords": None, + "notes": None, + "related_identifiers": None, + # TODO: Use `contributors`.
In the case of the hermes workflow itself, the + # contributors are currently all in `creators` already. So for now, we set this + # to `None`. Change this when relationship between authors and contributors can + # be specified in the processing step. + "contributors": None, + "references": None, + "communities": communities, + "grants": None, + "subjects": None, + "version": metadata.get('version'), + }.items() if v is not None} + + return deposition_metadata + + def _get_license_identifier(self) -> t.Optional[str]: + """Get Invenio license identifier that matches the given license URL. + + If no license is configured, ``None`` will be returned. + """ + license_url = self.ctx["codemeta"].get("license") + return self.resolver.resolve_license_id(license_url) + + def _get_community_identifiers(self): + """Get Invenio community identifiers from config. + + This function gets the communities to be used for the deposition on an Invenio-based + site from the config and checks their validity against the site's API. If one of the + identifiers can not be found on the site, a :class:`MisconfigurationError` is + raised. + """ + + communities = self.config.get("communities") + if communities is None: + return None + + community_ids = [] + for community_id in communities: + response = self.client.get_community(community_id) + if response.status_code == 404: + raise MisconfigurationError( + f"Not a valid community identifier: {community_id}" + ) + # Catch other problems + response.raise_for_status() + community_ids.append({"identifier": response.json()["id"]}) + + return community_ids + + def _get_access_modalities(self, license): + """Get access right, embargo date and access conditions based on configuration and given license. 
+ + This function implements the rules laid out in the `Zenodo developer documentation + `_: + + - ``access_right`` is a controlled vocabulary + - embargoed access depositions need an embargo date + - restricted access depositions need access conditions + - open and embargoed access depositions need a license + - closed access depositions have no further requirements + + This function also makes sure that the given embargo date can be parsed as an ISO + 8601 string representation and that the access rights are given as a string. + """ + access_right = self.config.get("access_right") + if access_right is None: + raise MisconfigurationError(f"deposit.{self.platform_name}.access_right is not configured") + + access_right_options = ["open", "embargoed", "restricted", "closed"] + if access_right not in access_right_options: raise MisconfigurationError( - f"Could not parse deposit.invenio.embargo_date {embargo_date!r}. " - "Must be in ISO 8601 format." + f"deposition.{self.platform_name}.access_right must be one of: " + f"{', '.join(access_right_options)}" ) - access_conditions = invenio_config.get("access_conditions") - if access_right == "restricted" and access_conditions is None: - raise MisconfigurationError( - f"With access_right {access_right}, " - "deposit.invenio.access_conditions must be configured" - ) + embargo_date = self.config.get("embargo_date") + if access_right == "embargoed" and embargo_date is None: + raise MisconfigurationError( + f"With access_right {access_right}, " + f"deposit.{self.platform_name}.embargo_date must be configured" + ) - if access_conditions is not None and not isinstance(access_conditions, str): - raise MisconfigurationError( - "deposit.invenio.access_conditions must be a string (HTML is allowed)." - ) + if embargo_date is not None: + try: + datetime.fromisoformat(embargo_date) + except ValueError: + raise MisconfigurationError( + f"Could not parse deposit.{self.platform_name}.embargo_date {embargo_date!r}. 
" + "Must be in ISO 8601 format." + ) + + access_conditions = self.config.get("access_conditions") + if access_right == "restricted" and access_conditions is None: + raise MisconfigurationError( + f"With access_right {access_right}, " + f"deposit.{self.platform_name}.access_conditions must be configured" + ) - if license is None and access_right in ["open", "embargoed"]: - raise MisconfigurationError( - f"With access_right {access_right}, a license is required." - ) + if access_conditions is not None and not isinstance(access_conditions, str): + raise MisconfigurationError( + f"deposit.{self.platform_name}.access_conditions must be a string (HTML is allowed)." + ) - if access_right == "closed": - pass + if license is None and access_right in ["open", "embargoed"]: + raise MisconfigurationError( + f"With access_right {access_right}, a license is required." + ) + + if access_right == "closed": + pass - return access_right, embargo_date, access_conditions + return access_right, embargo_date, access_conditions diff --git a/src/hermes/commands/deposit/invenio_rdm.py b/src/hermes/commands/deposit/invenio_rdm.py index 4199a326..ce7fa2b6 100644 --- a/src/hermes/commands/deposit/invenio_rdm.py +++ b/src/hermes/commands/deposit/invenio_rdm.py @@ -6,599 +6,74 @@ # SPDX-FileContributor: Oliver Bertuch # SPDX-FileContributor: Michael Meinel -import json -import logging import typing as t -from datetime import date, datetime -from pathlib import Path -from urllib.parse import urlparse -import click -import requests +from requests import HTTPError -from hermes import config -from hermes.commands.deposit.error import DepositionUnauthorizedError -from hermes.error import MisconfigurationError -from hermes.model.context import CodeMetaContext -from hermes.model.path import ContextPath -from hermes.utils import hermes_user_agent +from hermes.commands.deposit.invenio import ( + InvenioClient, + InvenioDepositPlugin, + InvenioResolver, +) -_DEFAULT_LICENSES_API_PATH = 
"api/vocabularies/licenses" -_DEFAULT_COMMUNITIES_API_PATH = "api/communities" -_DEFAULT_DEPOSITIONS_API_PATH = "api/deposit/depositions" +class InvenioRDMClient(InvenioClient): + DEFAULT_LICENSES_API_PATH = "api/vocabularies/licenses" + platform_name = "invenio_rdm" -def prepare(click_ctx: click.Context, ctx: CodeMetaContext): - """Prepare the deposition on an Invenio-based platform. + def get_license(self, license_id: str): + return super().get_license(license_id.casefold()) - In this function we do the following: + def get_licenses(self): + return self.get(f"{self.site_url}/{self.licenses_api_path}?size=1000") - - resolve the latest published version of this publication (if any) - - check whether the current version (given in the CodeMeta) was already published - - check whether we have a valid license identifier (if any) - - check wether the communities are valid (if configured) - - check access modalities (access right, access conditions, embargo data, existence - of license) - - check whether required configuration options are present - - check whether an auth token is given - - update ``ctx`` with metadata collected during the checks - """ - if not click_ctx.params["auth_token"]: - raise DepositionUnauthorizedError("No auth token given for deposition platform") +class InvenioRDMResolver(InvenioResolver): + invenio_client_class = InvenioRDMClient - invenio_path = ContextPath.parse("deposit.invenio_rdm") - invenio_config = config.get("deposit").get("invenio_rdm", {}) - rec_id, rec_meta = _resolve_latest_invenio_id(ctx) + def resolve_license_id(self, license_url: t.Optional[str]) -> t.Optional[dict]: + """Deliberately try to resolve the license URL to a valid InvenioRDM license information record from the + vocabulary. 
- version = ctx["codemeta"].get("version") - if rec_meta and (version == rec_meta.get("version")): - raise ValueError(f"Version {version} already deposited.") + First, this method tries to find the license URL in the list of known license vocabulary (which is fetched each + time, ouch...). - ctx.update(invenio_path['latestRecord'], {'id': rec_id, 'metadata': rec_meta}) + If the URL is not found (which is pretty probable by now, as CFFConvert produces SPDX-URLs while InvenioRDM still + relies on the outdated opensource.org URLs), the SPDX information record is fetched and all valid cross + references are searched. - site_url = invenio_config.get("site_url") - if site_url is None: - raise MisconfigurationError("deposit.invenio_rdm.site_url is not configured") + :return: The vocabulary record that is provided by InvenioRDM. + """ - licenses_api_path = invenio_config.get("api_paths", {}).get( - "licenses", _DEFAULT_LICENSES_API_PATH - ) - licenses_api_url = f"{site_url}/{licenses_api_path}" - license = _get_license_identifier(ctx, licenses_api_url) - ctx.update(invenio_path["license"], license) - - communities_api_path = invenio_config.get("api_paths", {}).get( - "communities", _DEFAULT_COMMUNITIES_API_PATH - ) - communities_api_url = f"{site_url}/{communities_api_path}" - communities = _get_community_identifiers(ctx, communities_api_url) - ctx.update(invenio_path["communities"], communities) - - access_right, embargo_date, access_conditions = _get_access_modalities(license) - ctx.update(invenio_path["access_right"], access_right) - ctx.update(invenio_path["embargo_date"], embargo_date) - ctx.update(invenio_path["access_conditions"], access_conditions) - - -def map_metadata(click_ctx: click.Context, ctx: CodeMetaContext): - """Map the harvested metadata onto the Invenio schema.""" - - deposition_metadata = _codemeta_to_invenio_deposition(ctx) - - metadata_path = ContextPath.parse("deposit.invenio_rdm.depositionMetadata") - ctx.update(metadata_path,
deposition_metadata) - - # Store a snapshot of the mapped data within the cache, useful for analysis, debugging, etc - with open(ctx.get_cache("deposit", "invenio_rdm", create=True), 'w') as invenio_json: - json.dump(deposition_metadata, invenio_json, indent=' ') - - -def create_initial_version(click_ctx: click.Context, ctx: CodeMetaContext): - """Create an initial version of a publication. - - If a previous publication exists, this function does nothing, leaving the work for - :func:`create_new_version`. - """ - - invenio_path = ContextPath.parse("deposit.invenio_rdm") - invenio_ctx = ctx[invenio_path] - latest_record_id = invenio_ctx.get("latestRecord", {}).get("id") - - if latest_record_id is not None: - # A previous version exists. This means that we need to create a new version in - # the next step. Thus, there is nothing to do here. - return - - if not click_ctx.params['initial']: - raise RuntimeError("Please use `--initial` to make an initial deposition.") - - _log = logging.getLogger("cli.deposit.invenio_rdm") - - invenio_config = config.get("deposit").get("invenio_rdm", {}) - site_url = invenio_config["site_url"] - depositions_api_path = invenio_config.get("api_paths", {}).get( - "depositions", _DEFAULT_DEPOSITIONS_API_PATH - ) - - deposition_metadata = invenio_ctx["depositionMetadata"] - - deposit_url = f"{site_url}/{depositions_api_path}" - response = requests.post( - deposit_url, - json={"metadata": deposition_metadata}, - headers={ - "User-Agent": hermes_user_agent, - "Authorization": f"Bearer {click_ctx.params['auth_token']}", - } - ) - - if not response.ok: - _log.error("Webserver response: \n%s", response.text) - raise RuntimeError(f"Could not create initial deposit {deposit_url!r}") - - deposit = response.json() - _log.debug("Created initial version deposit: %s", deposit["links"]["html"]) - with open(ctx.get_cache('deposit', 'deposit', create=True), 'w') as deposit_file: - json.dump(deposit, deposit_file, indent=4) - - 
ctx.update(invenio_path["links"]["bucket"], deposit["links"]["bucket"]) - ctx.update(invenio_path["links"]["publish"], deposit["links"]["publish"]) - - -def create_new_version(click_ctx: click.Context, ctx: CodeMetaContext): - """Create a new version of an existing publication. - - If no previous publication exists, this function does nothing because - :func:`create_initial_version` will have done the work. - """ - - invenio_path = ContextPath.parse("deposit.invenio_rdm") - invenio_ctx = ctx[invenio_path] - latest_record_id = invenio_ctx.get("latestRecord", {}).get("id") - - if latest_record_id is None: - # No previous version exists. This means that an initial version was created in - # the previous step. Thus, there is nothing to do here. - return - - session = requests.Session() - session.headers = { - "User-Agent": hermes_user_agent, - "Authorization": f"Bearer {click_ctx.params['auth_token']}", - } - - invenio_config = config.get("deposit").get("invenio_rdm", {}) - site_url = invenio_config["site_url"] - depositions_api_path = invenio_config.get("api_paths", {}).get( - "depositions", _DEFAULT_DEPOSITIONS_API_PATH - ) - - # Get current deposit - deposit_url = f"{site_url}/{depositions_api_path}/{latest_record_id}" - response = session.get(deposit_url) - if not response.ok: - raise RuntimeError(f"Failed to get current deposit {deposit_url!r}") - - # Create a new version using the newversion action - deposit_url = response.json()["links"]["newversion"] - response = session.post(deposit_url) - if not response.ok: - raise RuntimeError(f"Could not create new version deposit {deposit_url!r}") - - # Store link to latest draft to be used in :func:`update_metadata`. - old_deposit = response.json() - ctx.update(invenio_path["links"]["latestDraft"], old_deposit['links']['latest_draft']) - - -def update_metadata(click_ctx: click.Context, ctx: CodeMetaContext): - """Update the metadata of a draft. 
- - If no draft is found in the context, it is assumed that no metadata has to be - updated (e.g. because an initial version was created already containing the - metadata). - """ - - invenio_path = ContextPath.parse("deposit.invenio_rdm") - invenio_ctx = ctx[invenio_path] - draft_url = invenio_ctx.get("links", {}).get("latestDraft") - - if draft_url is None: - return - - _log = logging.getLogger("cli.deposit.invenio_rdm") - - deposition_metadata = invenio_ctx["depositionMetadata"] - - response = requests.put( - draft_url, - json={"metadata": deposition_metadata}, - headers={ - "User-Agent": hermes_user_agent, - "Authorization": f"Bearer {click_ctx.params['auth_token']}", - } - ) - - if not response.ok: - raise RuntimeError(f"Could not update metadata of draft {draft_url!r}") - - deposit = response.json() - _log.debug("Created new version deposit: %s", deposit["links"]["html"]) - with open(ctx.get_cache('deposit', 'deposit', create=True), 'w') as deposit_file: - json.dump(deposit, deposit_file, indent=4) - - ctx.update(invenio_path["links"]["bucket"], deposit["links"]["bucket"]) - ctx.update(invenio_path["links"]["publish"], deposit["links"]["publish"]) - - -def delete_artifacts(click_ctx: click.Context, ctx: CodeMetaContext): - """Delete existing file artifacts. - - This is done so that files which existed in an earlier publication but don't exist - any more, are removed. Otherwise they would cause an error because the didn't change - between versions. - """ - # TODO: This needs to be implemented! - pass - - -def upload_artifacts(click_ctx: click.Context, ctx: CodeMetaContext): - """Upload file artifacts to the deposit. - - We'll use the bucket API rather than the files API as it supports file sizes above - 100MB. The URL to the bucket of the deposit is taken from the context at - ``deposit.invenio.links.bucket``. 
- """ - - bucket_url_path = ContextPath.parse("deposit.invenio_rdm.links.bucket") - bucket_url = ctx[bucket_url_path] - - session = requests.Session() - session.headers = { - "User-Agent": hermes_user_agent, - "Authorization": f"Bearer {click_ctx.params['auth_token']}", - } - - files: list[click.Path] = click_ctx.params["file"] - for path_arg in files: - path = Path(path_arg) - - # This should not happen, as Click shall not accept dirs as arguments already. Zero trust anyway. - if not path.is_file(): - raise ValueError("Any given argument to be included in the deposit must be a file.") - - with open(path, "rb") as file_content: - response = session.put( - f"{bucket_url}/{path.name}", - data=file_content - ) - if not response.ok: - raise RuntimeError(f"Could not upload file {path.name!r} into bucket {bucket_url!r}") - - # This can potentially be used to verify the checksum - # file_resource = response.json() - - -def publish(click_ctx: click.Context, ctx: CodeMetaContext): - """Publish the deposited record. - - This is done by doing a POST request to the publication URL stored in the context at - ``deposit.invenio.links.publish``. - """ - - _log = logging.getLogger("cli.deposit.invenio_rdm") - - publish_url_path = ContextPath.parse("deposit.invenio_rdm.links.publish") - publish_url = ctx[publish_url_path] - - response = requests.post( - publish_url, - headers={ - "User-Agent": hermes_user_agent, - "Authorization": f"Bearer {click_ctx.params['auth_token']}" - } - ) - - if not response.ok: - _log.debug(response.text) - raise RuntimeError(f"Could not publish deposit via {publish_url!r}") - - record = response.json() - _log.info("Published record: %s", record["links"]["record_html"]) - - -def _resolve_latest_invenio_id(ctx: CodeMetaContext) -> t.Tuple[str, dict]: - """ - Using the given configuration and metadata, figure out the latest record id. 
- - If a record id is present as configuration ``deposit.invenio.record_id`` this one will be used to identify the - latest version of the record. Otherwise, if there is a doi present (either as configuration with key - ``deposit.invenio.doi`` or as a codemeta identifier, the DOI will be used to resolve the base record id. - - Anyway, the record id will always be used to resolve the latest version. - - If any of the resolution steps fail or produce an unexpected result, a ValueError will be thrown. - - :param ctx: The context for which the record id should be resolved. - :return: The Invenio record id and the metadata of the record - """ - - invenio_config = config.get('deposit').get('invenio_rdm', {}) - site_url = invenio_config.get('site_url') - if site_url is None: - raise MisconfigurationError("deposit.invenio_rdm.site_url is not configured") - - # Check if we configured an Invenio record ID (of the concept...) - record_id = invenio_config.get('record_id') - if record_id is None: - doi = invenio_config.get('doi') - if doi is None: - try: - # TODO: There might be more semantic in the codemeta.identifier... (also see schema.org) - identifier = ctx['codemeta.identifier'] - if identifier.startswith('https://doi.org/'): - doi = identifier[16:] - elif identifier.startswith('http://dx.doi.org/'): - doi = identifier[18:] - except KeyError: - pass - - if doi is not None: - # If we got a DOI, resolve it (using doi.org) into a Invenio URL ... and extract the record id. - record_id = _invenio_resolve_doi(site_url, doi) - - if record_id is not None: - # If we got a record id by now, resolve it using the Invenio API to the latests record. - return _invenio_resolve_record_id(site_url, record_id) - - return None, {} - - -def _invenio_resolve_doi(site_url, doi) -> str: - """ - Resolve an DOI to a Invenio URL and extract the record id. - - :param site_url: Root URL for the Invenio instance to use. 
- :param doi: The DOI to be resolved (only the identifier *without* the ``https://doi.org/`` prefix). - :return: The record ID on the respective instance. - """ - - res = requests.get(f'https://doi.org/{doi}') - - # This is a mean hack due to DataCite answering a 404 with a 200 status - if res.url == 'https://datacite.org/404.html': - raise ValueError(f"Invalid DOI: {doi}") - - # Ensure the resolved record is on the correct instance - if not res.url.startswith(site_url): - raise ValueError(f"{res.url} is not on configured host {site_url}.") - - # Extract the record id as last part of the URL path - page_url = urlparse(res.url) - *_, record_id = page_url.path.split('/') - return record_id - - -def _invenio_resolve_record_id(site_url: str, record_id: str) -> t.Tuple[str, dict]: - """ - Find the latest version of a given record. - - :param site_url: Root URL for the Invenio instance to use. - :param record_id: The record that sould be resolved. - :return: The record id of the latest version for the requested record. - """ - res = requests.get(f"{site_url}/api/records/{record_id}") - if res.status_code != 200: - raise ValueError(f"Could not retrieve record from {res.url}: {res.text}") - - res_json = res.json() - res = requests.get(res_json['links']['latest']) - if res.status_code != 200: - raise ValueError(f"Could not retrieve record from {res.url}: {res.text}") - - res_json = res.json() - return res_json['id'], res_json['metadata'] - - -def _codemeta_to_invenio_deposition(ctx: CodeMetaContext) -> dict: - """The mapping logic. - - Functionality similar to this exists in the ``convert_codemeta`` package which uses - the crosswalk tables to do the mapping: - - .. code-block:: python - - invenio_metadata = convert_codemeta.crosswalk( - metadata, "codemeta", "Zenodo" - ) - - Unfortunately, this doesn't work well with additional metadata in the same dict, so - it is safer to provide our own implementation. 
- - Currently, this function handles a lot of cases which we want to be able to - configure. A simple mapping from one JSON path to another is not enough. - - The metadata expected by Zenodo is described in the `Zenodo Developers guide - `_. Unfortunately, there doesn't seem - to be a schema one can download in order to validate these metadata. There might be - differences between Invenio-based platforms. - """ - - metadata = ctx["codemeta"] - license = ctx["deposit"]["invenio_rdm"]["license"] - communities = ctx["deposit"]["invenio_rdm"]["communities"] - access_right = ctx["deposit"]["invenio_rdm"]["access_right"] - embargo_date = ctx["deposit"]["invenio_rdm"]["embargo_date"] - access_conditions = ctx["deposit"]["invenio_rdm"]["access_conditions"] - - creators = [ - # TODO: Distinguish between @type "Person" and others - { - k: v for k, v in { - # TODO: This is ugly - "affiliation": author.get("affiliation", {"legalName": None}).get("legalName"), - # Invenio wants "family, given". author.get("name") might not have this format. - "name": f"{author.get('familyName')}, {author.get('givenName')}" - if author.get("familyName") and author.get("givenName") - else author.get("name"), - # Invenio expects the ORCID without the URL part - "orcid": author.get("@id", "").replace("https://orcid.org/", "") or None, - }.items() if v is not None - } - for author in metadata["author"] - ] - - # This is not used at the moment. See comment below in `deposition_metadata` dict. - contributors = [ # noqa: F841 - # TODO: Distinguish between @type "Person" and others - { - k: v for k, v in { - # TODO: This is ugly - "affiliation": contributor.get("affiliation", {"legalName": None}).get("legalName"), - # Invenio wants "family, given". contributor.get("name") might not have this format. 
- "name": f"{contributor.get('familyName')}, {contributor.get('givenName')}" - if contributor.get("familyName") and contributor.get("givenName") - else contributor.get("name"), - # Invenio expects the ORCID without the URL part - "orcid": contributor.get("@id", "").replace("https://orcid.org/", "") or None, - # TODO: Many possibilities here. Get from config - "type": "ProjectMember", - }.items() if v is not None - } - # TODO: Filtering out "GitHub" should be done elsewhere - for contributor in metadata["contributor"] if contributor.get("name") != "GitHub" - ] - - # TODO: Use the fields currently set to `None`. - # Some more fields are available but they most likely don't relate to software - # publications targeted by hermes. - deposition_metadata = {k: v for k, v in { - # If upload_type is "publication"/"image", a publication_type/image_type must be - # specified. Since hermes targets software publications, this can be ignored and - # upload_type can be hard-coded to "software". - # TODO: Make this a constant maybe. - "upload_type": "software", - # IS0 8601-formatted date - # TODO: Maybe we want a different date? Then make this configurable. If not, - # this can be removed as it defaults to today. - "publication_date": date.today().isoformat(), - "title": metadata["name"], - "creators": creators, - # TODO: Use a real description here. Possible sources could be - # `tool.poetry.description` from pyproject.toml or `abstract` from - # CITATION.cff. This should then be stored in codemeta description field. - "description": metadata["name"], - "access_right": access_right, - "license": license, - "embargo_date": embargo_date, - "access_conditions": access_conditions, - # TODO: If a publisher already has assigned a DOI to the files we want to - # upload, it should be used here. In this case, Invenio will not give us a new - # one. Set "prereserve_doi" accordingly. - "doi": None, - # This prereserves a DOI that can then be added to the files before publishing - # them. 
- # TODO: Use the DOI we get back from this. - "prereserve_doi": True, - # TODO: A good source for this could be `tool.poetry.keywords` in pyproject.toml. - "keywords": None, - "notes": None, - "related_identifiers": None, - # TODO: Use `contributors`. In the case of the hermes workflow itself, the - # contributors are currently all in `creators` already. So for now, we set this - # to `None`. Change this when relationship between authors and contributors can - # be specified in the processing step. - "contributors": None, - "references": None, - "communities": communities, - "grants": None, - "subjects": None, - "version": metadata.get('version'), - }.items() if v is not None} - - return deposition_metadata - - -def _get_license_identifier(ctx: CodeMetaContext, license_api_url: str): - """Get Invenio license representation from CodeMeta. - - The license to use is extracted from the ``license`` field in the - :class:`CodeMetaContext` and converted into an appropiate license identifier to be - passed to an Invenio instance. - - A license according to CodeMeta may be a URL (text) or a CreativeWork. This function - only handles URLs. If a ``license`` field is present in the CodeMeta and it is not - of type :class:`str`, a :class:`RuntimeError` is raised. - - Invenio instances take a license string which refers to a license identifier. - Typically, Invenio instances offer licenses from https://opendefinition.org and - https://spdx.org. However, it is possible to mint PIDs for custom licenses. - - An API endpoint (usually ``/api/vocabularies/licenses``) can be used to check whether a given - license is supported by the Invenio instance. This function tries to retrieve the - license by lower-casing the identifier at the end of the license URL path. If this identifier - does not exist on the Invenio instance, all available licenses are fetched and the URL is sought - for in the results. 
However, this might again not lead to success (as Invenio still provides - the obsolete https://opensource.org URLs) but harvesters might provide the SPDX style URLs. - Hence, the license URL is checked whether it is pointing to https://spdx.org/licenses/ and if - this is the case, the license record from SPDX is fetched and all `crossRef` URLs that are flagged - `isValid` are again sought for in the full set of licenses. Only if this still fails, - a :class:`RuntimeError` is raised. - - If no license is given in the CodeMeta, ``None`` is returned. - """ - - license_url = ctx["codemeta"].get("license") - - if license_url is None: - return None - - if not isinstance(license_url, str): - raise RuntimeError( - "The given license in CodeMeta must be of type str. " - "Licenses of type 'CreativeWork' are not supported." - ) - - # First try: Look up license by assuming lower-case name is the correct identifier - parsed_url = urlparse(license_url) - url_path = parsed_url.path.rstrip("/") - license_id = url_path.split("/")[-1].lower() - - response = requests.get( - f"{license_api_url}/{license_id}", headers={"User-Agent": hermes_user_agent} - ) - if response.ok: - license_info = response.json() - - # Second try: Fetch full list of licenses available... maybe we should cache this. - else: - license_info = _look_up_license_info(license_api_url, license_url) - - return license_info["id"] + # First try to resolve using the simple way that worked well with Zenodo before InvenioRDM + try: + return super().resolve_license_id(license_url) + except HTTPError: + pass + # If the easy "mapping" did not work, we really need to "search" for the correct license ID. + response = self.client.get_licenses() + response.raise_for_status() + valid_licenses = response.json() -def _look_up_license_info(license_api_url, license_url): - """Deliberately try to resolve the license URL to a valid InvenioRDM license information record from the - vocabulary. 
+ license_info = self._search_license_info(license_url, valid_licenses) + if license_info is None and license_url.startswith('https://spdx.org/licenses/'): + response = self.client.get(f"{license_url}.json") + response.raise_for_status() - First, this method tries to find the license URL in the list of known license vocabulary (which is fetched each - time, ouch...). + for license_cross_ref in response.json()['crossRef']: + if not license_cross_ref['isValid']: + continue - If the URL is not found (what is pretty probable by now, as CFFConvert produces SPDX-URLs while InvenioRDM still - relies on the overhauled opensource.org URLs), the SPDX information record is fetched and all valid cross references - are sought for. + license_info = self._search_license_info(license_cross_ref["url"], valid_licenses) + if license_info is not None: + break + else: + raise RuntimeError(f"Could not resolve license URL {license_url} to a valid identifier.") - :param license_api_url: Base API endpoint for InvenioRDM license vocabulary queries. - :param license_url: The URL for the license we are search an identifier for. - :return: The vocabulary record that is provided by InvenioRDM. 
- """ - response = requests.get( - f"{license_api_url}?size=1000", headers={"User-Agent": hermes_user_agent} - ) - response.raise_for_status() - valid_licenses = response.json() + return license_info - def _search_license_info(_url): + def _search_license_info(self, _url: str, valid_licenses: dict) -> t.Optional[dict]: for license_info in valid_licenses['hits']['hits']: try: if license_info['props']['url'] == _url: @@ -608,117 +83,8 @@ def _search_license_info(_url): else: return None - license_info = _search_license_info(license_url) - if license_info is None and license_url.startswith('https://spdx.org/licenses/'): - response = requests.get(f"{license_url}.json", headers={"User-Agent": hermes_user_agent}) - response.raise_for_status() - - for license_cross_ref in response.json()['crossRef']: - if not license_cross_ref['isValid']: - continue - - license_info = _search_license_info(license_cross_ref["url"]) - if license_info is not None: - break - else: - raise RuntimeError(f"Could not resolve license URL {license_url} to a valid identifier.") - - return license_info - - -def _get_community_identifiers(ctx: CodeMetaContext, communities_api_url: str): - """Get Invenio community identifiers from config. - - This function gets the communities to be used for the deposition on an Invenio-based - site from the config and checks their validity against the site's API. If one of the - identifiers can not be found on the site, a :class:`MisconfigurationError` is - raised. 
- """ - - communities = config.get("deposit").get("invenio_rdm", {}).get("communities") - if communities is None: - return None - - session = requests.Session() - session.headers = {"User-Agent": hermes_user_agent} - - community_ids = [] - for community_id in communities: - url = f"{communities_api_url}/{community_id}" - response = session.get(url) - if response.status_code == 404: - raise MisconfigurationError( - f"Not a valid community identifier: {community_id}" - ) - # Catch other problems - response.raise_for_status() - community_ids.append({"identifier": response.json()["id"]}) - - return community_ids - - -def _get_access_modalities(license): - """Get access right, embargo date and access conditions based on configuration and given license. - - This function implements the rules laid out in the `Zenodo developer documentation - `_: - - - ``access_right`` is a controlled vocabulary - - embargoed access depositions need an embargo date - - restricted access depositions need access conditions - - open and embargoed access depositions need a license - - closed access depositions have no further requirements - - This function also makes sure that the given embargo date can be parsed as an ISO - 8601 string representation and that the access rights are given as a string. 
- """ - invenio_config = config.get("deposit").get("invenio_rdm", {}) - - access_right = invenio_config.get("access_right") - if access_right is None: - raise MisconfigurationError("deposit.invenio_rdm.access_right is not configured") - - access_right_options = ["open", "embargoed", "restricted", "closed"] - if access_right not in access_right_options: - raise MisconfigurationError( - "deposition.invenio_rdm.access_right must be one of: " - f"{', '.join(access_right_options)}" - ) - - embargo_date = invenio_config.get("embargo_date") - if access_right == "embargoed" and embargo_date is None: - raise MisconfigurationError( - f"With access_right {access_right}, " - "deposit.invenio_rdm.embargo_date must be configured" - ) - - if embargo_date is not None: - try: - datetime.fromisoformat(embargo_date) - except ValueError: - raise MisconfigurationError( - f"Could not parse deposit.invenio_rdm.embargo_date {embargo_date!r}. " - "Must be in ISO 8601 format." - ) - - access_conditions = invenio_config.get("access_conditions") - if access_right == "restricted" and access_conditions is None: - raise MisconfigurationError( - f"With access_right {access_right}, " - "deposit.invenio_rdm.access_conditions must be configured" - ) - - if access_conditions is not None and not isinstance(access_conditions, str): - raise MisconfigurationError( - "deposit.invenio_rdm.access_conditions must be a string (HTML is allowed)." - ) - - if license is None and access_right in ["open", "embargoed"]: - raise MisconfigurationError( - f"With access_right {access_right}, a license is required." 
- ) - - if access_right == "closed": - pass - return access_right, embargo_date, access_conditions +class IvenioRDMDepositPlugin(InvenioDepositPlugin): + platform_name = "invenio_rdm" + invenio_client_class = InvenioRDMClient + invenio_resolver_class = InvenioRDMResolver diff --git a/src/hermes/commands/workflow.py b/src/hermes/commands/workflow.py index c1ad6c59..bfa2bc0f 100644 --- a/src/hermes/commands/workflow.py +++ b/src/hermes/commands/workflow.py @@ -16,6 +16,7 @@ import click from hermes import config +from hermes.commands.deposit.base import BaseDepositPlugin from hermes.error import MisconfigurationError from hermes.model.context import HermesContext, HermesHarvestContext, CodeMetaContext from hermes.model.errors import MergeError @@ -189,65 +190,34 @@ def deposit(click_ctx: click.Context, initial, auth_token, file): deposit_config = config.get("deposit") - # This is used as the default value for all entry point names for the deposit step - target_platform = deposit_config.get("target", "invenio") - - entry_point_groups = [ - "hermes.deposit.prepare", - "hermes.deposit.map", - "hermes.deposit.create_initial_version", - "hermes.deposit.create_new_version", - "hermes.deposit.update_metadata", - "hermes.deposit.delete_artifacts", - "hermes.deposit.upload_artifacts", - "hermes.deposit.publish", - ] - - # For each group, an entry point can be configured via ``deposit_config`` using the - # the part after the last dot as the config key. If no such key is found, the target - # platform value is used to search for an entry point in the respective group. - selected_entry_points = { - group: deposit_config.get(group.split(".")[-1], target_platform) - for group in entry_point_groups - } - - # Try to load all entrypoints first, so we don't fail because of misconfigured - # entry points while some tasks of the deposition step were already started. (E.g. 
- # new version was already created on the deposition platform but artifact upload - # fails due to the entry point not being found.) - loaded_entry_points = [] - for group, name in selected_entry_points.items(): - try: - ep, *eps = metadata.entry_points(group=group, name=name) - except ValueError: # not enough values to unpack - if name != target_platform: - _log.error( - f"Explicitly configured entry point name {name!r} " - f"not found in group {group!r}" - ) - click_ctx.exit(1) - _log.debug( - f"Group {group!r} has no entry point with name {name!r}; skipping" - ) - continue + plugin_group = "hermes.deposit" + # TODO: Is having a default a good idea? + # TODO: Should we allow a list here so that multiple plugins are run? + plugin_name = deposit_config.get("target", "invenio") + try: + ep, *eps = metadata.entry_points(group=plugin_group, name=plugin_name) if eps: # Entry point names in these groups refer to the deposition platforms. For # each platform, only a single implementation should exist. Otherwise we # would not be able to decide which implementation to choose. _log.error( - f"Entry point name {name!r} is not unique within group {group!r}" + f"Plugin name {plugin_name!r} is not unique within group {plugin_group!r}" ) click_ctx.exit(1) + except ValueError: # not enough values to unpack + _log.error(f"Plugin name {plugin_name!r} was not found in group {plugin_group!r}") + click_ctx.exit(1) - loaded_entry_points.append(ep.load()) + # TODO: Could this raise an exception? 
+ deposit_plugin_class: BaseDepositPlugin = ep.load() + deposit_plugin = deposit_plugin_class(click_ctx, ctx) - for entry_point in loaded_entry_points: - try: - entry_point(click_ctx, ctx) - except (RuntimeError, MisconfigurationError) as e: - _log.error(f"Error in {group!r} entry point {name!r}: {e}") - click_ctx.exit(1) + try: + deposit_plugin() + except (RuntimeError, MisconfigurationError) as e: + _log.error(f"Error in {plugin_group!r} plugin {plugin_name!r}: {e}") + click_ctx.exit(1) @click.group(invoke_without_command=True) diff --git a/test/hermes_test/commands/deposit/test_invenio.py b/test/hermes_test/commands/deposit/test_invenio.py index d7c4beed..79df30ee 100644 --- a/test/hermes_test/commands/deposit/test_invenio.py +++ b/test/hermes_test/commands/deposit/test_invenio.py @@ -7,32 +7,59 @@ from unittest import mock +import click import pytest from hermes.commands.deposit import invenio from hermes.error import MisconfigurationError -def test_resolve_doi(requests_mock): +@pytest.fixture +def resolver(): + with mock.patch("hermes.config.get") as mocked_deposit_config: + mocked_deposit_config.return_value = { + "invenio": { + "site_url": "https://invenio.example.com", + } + } + r = invenio.InvenioResolver() + return r + + +@pytest.fixture +def depositor(): + click_ctx = click.Context(click.Command("deposit")) + click_ctx.params.update({"auth_token": ""}) + with mock.patch("hermes.config.get") as mocked_deposit_config: + mocked_deposit_config.return_value = { + "invenio": { + "site_url": "https://invenio.example.com", + } + } + d = invenio.InvenioDepositPlugin(click_ctx, None) + return d + + +def test_resolve_doi(requests_mock, resolver): requests_mock.get('https://doi.org/123.45/foo.bar-6789', status_code=302, - headers={'Location': 'https://foo.bar/record/6789'}) - requests_mock.get('https://foo.bar/record/6789') + headers={'Location': 'https://invenio.example.com/record/6789'}) + requests_mock.get('https://invenio.example.com/record/6789') - assert 
invenio._invenio_resolve_doi('https://foo.bar', '123.45/foo.bar-6789') == '6789' + assert resolver.resolve_doi('123.45/foo.bar-6789') == '6789' -def test_resolve_doi_wrong_host(requests_mock): +def test_resolve_doi_wrong_host(requests_mock, resolver): requests_mock.get('https://doi.org/123.45/foo.bar-6789', status_code=302, - headers={'Location': 'https://foo.baz/record/6789'}) - requests_mock.get('https://foo.baz/record/6789') + headers={'Location': 'https://not.invenio.example.com/record/6789'}) + requests_mock.get('https://not.invenio.example.com/record/6789') with pytest.raises(ValueError): - invenio._invenio_resolve_doi('https://foo.bar', '123.45/foo.bar-6789') + resolver.resolve_doi('123.45/foo.bar-6789') -def test_resolve_doi_unknown(requests_mock): +def test_resolve_doi_unknown(requests_mock, resolver): requests_mock.get('https://doi.org/123.45/foo.bar-6789', status_code=302, headers={'Location': 'https://datacite.org/404.html'}) @@ -41,143 +68,101 @@ def test_resolve_doi_unknown(requests_mock): requests_mock.get('https://datacite.org/404.html', status_code=200) with pytest.raises(ValueError): - invenio._invenio_resolve_doi('https://foo.bar', '123.45/foo.bar-6789') + resolver.resolve_doi('123.45/foo.bar-6789') -def test_resolve_record_id(requests_mock): - requests_mock.get('https://foo.bar/api/records/6789', - text='{"links":{"latest":"https://foo.bar/api/records/12345"}}') - requests_mock.get('https://foo.bar/api/records/12345', text='{"id":"12345","metadata":{"mock":42}}') +def test_resolve_record_id(requests_mock, resolver): + requests_mock.get('https://invenio.example.com/api/records/6789', + text='{"links":{"latest":"https://invenio.example.com/api/records/12345"}}') + requests_mock.get('https://invenio.example.com/api/records/12345', text='{"id":"12345","metadata":{"mock":42}}') - assert invenio._invenio_resolve_record_id('https://foo.bar', '6789') == ('12345', {"mock": 42}) + assert resolver.resolve_record_id('6789') == ('12345', {"mock": 42}) -def 
test_resolve_record_id_unknown(requests_mock): - requests_mock.get('https://foo.bar/api/records/6789', status_code=404, text="Not found") +def test_resolve_record_id_unknown(requests_mock, resolver): + requests_mock.get('https://invenio.example.com/api/records/6789', status_code=404, text="Not found") with pytest.raises(ValueError): - invenio._invenio_resolve_record_id('https://foo.bar', '6789') + resolver.resolve_record_id('6789') -def test_resolve_record_id_latest_unknown(requests_mock): - requests_mock.get('https://foo.bar/api/records/6789', - text='{"links":{"latest":"https://foo.bar/api/records/12345"}}') - requests_mock.get('https://foo.bar/api/records/12345', status_code=404) +def test_resolve_record_id_latest_unknown(requests_mock, resolver): + requests_mock.get('https://invenio.example.com/api/records/6789', + text='{"links":{"latest":"https://invenio.example.com/api/records/12345"}}') + requests_mock.get('https://invenio.example.com/api/records/12345', status_code=404) with pytest.raises(ValueError): - invenio._invenio_resolve_record_id('https://foo.bar', '6789') + resolver.resolve_record_id('6789') -def test_get_access_modalities_closed(): - with mock.patch("hermes.config.get") as mocked_deposit_config: - mocked_deposit_config.return_value = { - 'invenio': { - 'access_right': 'closed', - } - } - access_right, _, _ = invenio._get_access_modalities(None) - assert access_right == "closed" +def test_get_access_modalities_closed(depositor): + depositor.config.update({'access_right': 'closed'}) + access_right, _, _ = depositor._get_access_modalities(None) + assert access_right == "closed" -def test_get_access_modalities_embargoed_no_date_no_license(): - with mock.patch("hermes.config.get") as mocked_deposit_config: - mocked_deposit_config.return_value = { - 'invenio': { - 'access_right': 'embargoed', - } - } - with pytest.raises(MisconfigurationError): - invenio._get_access_modalities(None) +def 
test_get_access_modalities_embargoed_no_date_no_license(depositor): + depositor.config.update({'access_right': 'embargoed'}) + with pytest.raises(MisconfigurationError): + depositor._get_access_modalities(None) -def test_get_access_modalities_embargoed_no_date_with_license(): - with mock.patch("hermes.config.get") as mocked_deposit_config: - mocked_deposit_config.return_value = { - 'invenio': { - 'access_right': 'embargoed', - } - } - with pytest.raises(MisconfigurationError): - invenio._get_access_modalities("Apache-2.0") +def test_get_access_modalities_embargoed_no_date_with_license(depositor): + depositor.config.update({'access_right': 'embargoed'}) + with pytest.raises(MisconfigurationError): + depositor._get_access_modalities("Apache-2.0") -def test_get_access_modalities_embargoed_with_date_with_license(): - with mock.patch("hermes.config.get") as mocked_deposit_config: - mocked_deposit_config.return_value = { - 'invenio': { - 'access_right': 'embargoed', - 'embargo_date': '2050-05-01', - } - } - access_right, embargo_date, _ = invenio._get_access_modalities("Apache-2.0") - assert access_right == "embargoed" - assert embargo_date == "2050-05-01" +def test_get_access_modalities_embargoed_with_date_with_license(depositor): + depositor.config.update({ + 'access_right': 'embargoed', + 'embargo_date': '2050-05-01', + }) + access_right, embargo_date, _ = depositor._get_access_modalities("Apache-2.0") + assert access_right == "embargoed" + assert embargo_date == "2050-05-01" -def test_get_access_modalities_embargoed_with_broken_date_with_license(): - with mock.patch("hermes.config.get") as mocked_deposit_config: - mocked_deposit_config.return_value = { - 'invenio': { - 'access_right': 'embargoed', - 'embargo_date': 'not-a-date', - } - } - with pytest.raises(MisconfigurationError): - invenio._get_access_modalities("Apache-2.0") +def test_get_access_modalities_embargoed_with_broken_date_with_license(depositor): + depositor.config.update({ + 'access_right': 'embargoed', 
+ 'embargo_date': 'not-a-date', + }) + with pytest.raises(MisconfigurationError): + depositor._get_access_modalities("Apache-2.0") -def test_get_access_modalities_restricted_no_conditions(): - with mock.patch("hermes.config.get") as mocked_deposit_config: - mocked_deposit_config.return_value = { - 'invenio': { - 'access_right': 'restricted', - } - } - with pytest.raises(MisconfigurationError): - invenio._get_access_modalities(None) +def test_get_access_modalities_restricted_no_conditions(depositor): + depositor.config.update({'access_right': 'restricted'}) + with pytest.raises(MisconfigurationError): + depositor._get_access_modalities(None) -def test_get_access_modalities_restricted_with_conditions(): - with mock.patch("hermes.config.get") as mocked_deposit_config: - mocked_deposit_config.return_value = { - 'invenio': { - 'access_right': 'restricted', - 'access_conditions': 'You must be cool', - } - } - access_right, _, access_conditions = invenio._get_access_modalities(None) - assert access_right == "restricted" - assert access_conditions == "You must be cool" +def test_get_access_modalities_restricted_with_conditions(depositor): + depositor.config.update({ + 'access_right': 'restricted', + 'access_conditions': 'You must be cool', + }) + access_right, _, access_conditions = depositor._get_access_modalities(None) + assert access_right == "restricted" + assert access_conditions == "You must be cool" -def test_get_access_modalities_open_no_license(): - with mock.patch("hermes.config.get") as mocked_deposit_config: - mocked_deposit_config.return_value = { - 'invenio': { - 'access_right': 'open', - } - } - with pytest.raises(MisconfigurationError): - invenio._get_access_modalities(None) +def test_get_access_modalities_open_no_license(depositor): + depositor.config.update({'access_right': 'open'}) + with pytest.raises(MisconfigurationError): + depositor._get_access_modalities(None) -def test_get_access_modalities_open_with_license(): - with 
mock.patch("hermes.config.get") as mocked_deposit_config: - mocked_deposit_config.return_value = { - 'invenio': { - 'access_right': 'open', - } - } - access_right, _, _ = invenio._get_access_modalities("Apache-2.0") - assert access_right == "open" +def test_get_access_modalities_open_with_license(depositor): + depositor.config.update({'access_right': 'open'}) + access_right, _, _ = depositor._get_access_modalities("Apache-2.0") + assert access_right == "open" -def test_get_access_modalities_broken_access_right(): - with mock.patch("hermes.config.get") as mocked_deposit_config: - mocked_deposit_config.return_value = { - 'invenio': { - 'access_right': 'unknown', # does not exist - } - } - with pytest.raises(MisconfigurationError): - invenio._get_access_modalities(None) +def test_get_access_modalities_broken_access_right(depositor): + depositor.config.update({ + 'access_right': 'unknown', # does not exist + }) + with pytest.raises(MisconfigurationError): + depositor._get_access_modalities(None)