diff --git a/__init__.py b/__init__.py
index 722df1424..90707339d 100644
--- a/__init__.py
+++ b/__init__.py
@@ -26,31 +26,32 @@
 # Import all modules containing rules into the package namespace,
 # so that they become visible to iRODS.
 
-from admin import *
-from browse import *
-from folder import *
-from groups import *
-from json_datacite import *
-from json_landing_page import *
-from mail import *
-from meta import *
-from meta_form import *
-from provenance import *
-from research import *
-from resources import *
-from schema import *
-from schema_transformation import *
-from schema_transformations import *
-from vault import *
-from datacite import *
-from epic import *
-from publication import *
-from policies import *
-from replication import *
-from revisions import *
-from settings import *
-from notifications import *
-from integration_tests import *
+from admin import *
+from browse import *
+from folder import *
+from groups import *
+from json_datacite import *
+from json_landing_page import *
+from mail import *
+from meta import *
+from meta_form import *
+from provenance import *
+from research import *
+from resources import *
+from schema import *
+from schema_transformation import *
+from schema_transformations import *
+from publication_troubleshoot import *
+from vault import *
+from datacite import *
+from epic import *
+from publication import *
+from policies import *
+from replication import *
+from revisions import *
+from settings import *
+from notifications import *
+from integration_tests import *
 
 # Import certain modules only when enabled.
 from .util.config import config
diff --git a/integration_tests.py b/integration_tests.py
index f345b5c58..f92f4c0dd 100644
--- a/integration_tests.py
+++ b/integration_tests.py
@@ -117,6 +117,27 @@ def _test_avu_rmw_collection(ctx, rmw_attributes):
     return result
 
 
+def _test_avu_get_attr_val_of_coll(ctx, attr, value):
+    # Test getting the value of an attribute on a collection
+    tmp_coll = _create_tmp_collection(ctx)
+    ctx.msi_add_avu('-c', tmp_coll, attr, value, "baz")
+    result = avu.get_attr_val_of_coll(ctx, tmp_coll, attr)
+    collection.remove(ctx, tmp_coll)
+    return result
+
+
+def _test_avu_get_attr_val_of_coll_exception(ctx):
+    # Test that getting a non-existent attribute on a collection raises an exception (returns True if an exception was raised)
+    tmp_coll = _create_tmp_collection(ctx)
+    result = False
+    try:
+        result = avu.get_attr_val_of_coll(ctx, tmp_coll, "foo")
+    except Exception:
+        result = True
+    collection.remove(ctx, tmp_coll)
+    return result
+
+
 def _test_folder_set_retry_avus(ctx):
     tmp_coll = _create_tmp_collection(ctx)
     folder.folder_secure_set_retry_avus(ctx, tmp_coll, 2)
@@ -482,6 +503,12 @@ def _test_folder_secure_func(ctx, func):
                            "check": lambda x: (("aap", "noot", "mies") in x
                                                and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1
                                                )},
+    {"name": "avu.get_attr_val_of_coll.exists.yes",
+     "test": lambda ctx: _test_avu_get_attr_val_of_coll(ctx, "foo", "bar"),
+     "check": lambda x: x == "bar"},
+    {"name": "avu.get_attr_val_of_coll.exists.no",
+     "test": lambda ctx: _test_avu_get_attr_val_of_coll_exception(ctx),
+     "check": lambda x: x},
     {"name": "avu.apply_atomic_operations.collection",
      "test": lambda ctx: _test_msvc_apply_atomic_operations_collection(ctx),
      "check": lambda x: (("foo", "bar", "baz") in x and len(x) == 1)},
diff --git a/meta.py b/meta.py
index 887401958..797162ec5 100644
--- a/meta.py
+++ b/meta.py
@@ -13,6 +13,7 @@
 import irods_types
 from deepdiff import DeepDiff
 
+import meta_form
 import provenance
 import publication
 import schema as schema_
@@ -790,3 +791,50 @@ def copy_user_metadata(ctx, source, target):
         log.write(ctx, "copy_user_metadata: copied user metadata from <{}> to <{}/original>".format(source, target))
     except Exception:
         log.write(ctx, "copy_user_metadata: failed to copy user metadata from <{}> to <{}/original>".format(source, target))
+
+
+def vault_metadata_matches_schema(ctx, coll_name, schema_cache, report_name, write_stdout):
+    """Process a single data package to retrieve its metadata and validate that it conforms to the schema.
+
+    :param ctx:          Combined type of a callback and rei struct
+    :param coll_name:    String representing the data package collection path.
+    :param schema_cache: Dictionary storing schema blueprints, can be empty.
+    :param report_name:  Name of report script (for logging)
+    :param write_stdout: A boolean representing whether to write to stdout or rodsLog
+
+    :returns: A dictionary indicating whether the metadata matches the schema, and the schema short name.
+    """
+    metadata_path = get_latest_vault_metadata_path(ctx, coll_name)
+
+    if not metadata_path:
+        log.write(ctx, "{} skips {}, because metadata could not be found.".format(report_name, coll_name), write_stdout)
+        return None
+
+    try:
+        metadata = jsonutil.read(ctx, metadata_path)
+    except Exception as exc:
+        log.write(ctx, "{} skips {}, because of exception while reading metadata file {}: {}".format(report_name, coll_name, metadata_path, str(exc)), write_stdout)
+        log.write(ctx, "vault_metadata_matches_schema: Error while reading metadata file {} of data package {}: {}".format(metadata_path, coll_name, str(exc)), write_stdout)
+        return None
+
+    # Determine schema
+    schema_id = schema_.get_schema_id(ctx, metadata_path)
+    schema_shortname = schema_id.split("/")[-2]
+
+    # Retrieve schema and cache it for future use
+    schema_path = schema_.get_schema_path_by_id(ctx, metadata_path, schema_id)
+    if schema_shortname in schema_cache:
+        schema_contents = schema_cache[schema_shortname]
+    else:
+        schema_contents = jsonutil.read(ctx, schema_path)
+        schema_cache[schema_shortname] = schema_contents
+
+    # Check whether metadata matches schema and log any errors
+    error_list = get_json_metadata_errors(ctx, metadata_path, metadata=metadata, schema=schema_contents)
+    match_schema = len(error_list) == 0
+    if not match_schema:
+        errors_formatted = [meta_form.humanize_validation_error(e).encode('utf-8') for e in error_list]
+        log.write(ctx, "{}: metadata {} did not match schema {}: {}".format(report_name, metadata_path, schema_shortname, str(errors_formatted)), write_stdout)
+        log.write(ctx, "vault_metadata_matches_schema: Metadata {} of data package {} did not match the schema {}. Error list: {}".format(metadata_path, coll_name, schema_shortname, str(errors_formatted)), write_stdout)
+
+    return {"schema": schema_shortname, "match_schema": match_schema}
diff --git a/publication.py b/publication.py
index 47e7d3470..d97507197 100644
--- a/publication.py
+++ b/publication.py
@@ -1326,10 +1326,10 @@ def rule_update_publication(ctx, vault_package, update_datacite, update_landingp
     :param update_moai:        Flag that indicates updating MOAI (OAI-PMH)
     """
     if user.user_type(ctx) != 'rodsadmin':
-        log.write_stdout(ctx, "User is no rodsadmin")
+        log.write(ctx, "User is no rodsadmin", True)
         return
 
-    log.write_stdout(ctx, "[UPDATE PUBLICATIONS] Start for {}".format(vault_package))
+    log.write(ctx, "[UPDATE PUBLICATIONS] Start for {}".format(vault_package), True)
     collections = genquery.row_iterator(
         "COLL_NAME",
         "COLL_NAME like '%%/home/vault-%%' "
@@ -1345,12 +1345,12 @@ def rule_update_publication(ctx, vault_package, update_datacite, update_landingp
         if ((vault_package == '*' and re.match(r'/[^/]+/home/vault-.*', coll_name)) or (vault_package != '*' and re.match(r'/[^/]+/home/vault-.*', coll_name) and coll_name == vault_package)):
             packages_found = True
             output = update_publication(ctx, coll_name, update_datacite == 'Yes', update_landingpage == 'Yes', update_moai == 'Yes')
-            log.write_stdout(ctx, coll_name + ': ' + output)
+            log.write(ctx, coll_name + ': ' + output, True)
 
     if not packages_found:
-        log.write_stdout(ctx, "[UPDATE PUBLICATIONS] No packages found for {}".format(vault_package))
+        log.write(ctx, "[UPDATE PUBLICATIONS] No packages found for {}".format(vault_package), True)
     else:
-        log.write_stdout(ctx, "[UPDATE PUBLICATIONS] Finished for {}".format(vault_package))
+        log.write(ctx, "[UPDATE PUBLICATIONS] Finished for {}".format(vault_package), True)
 
 
 def update_publication(ctx, vault_package, update_datacite=False, update_landingpage=False, update_moai=False):
diff --git a/publication_troubleshoot.py b/publication_troubleshoot.py
new file mode 100644
index 000000000..8f948fcbb
--- /dev/null
+++ b/publication_troubleshoot.py
@@ -0,0 +1,440 @@
+# -*- coding: utf-8 -*-
+"""Functions and rules for troubleshooting published data packages."""
+
+__copyright__ = 'Copyright (c) 2024, Utrecht University'
+__license__ = 'GPLv3, see LICENSE'
+
+__all__ = [
+    'api_batch_troubleshoot_published_data_packages',
+    'rule_batch_troubleshoot_published_data_packages'
+]
+
+import json
+from datetime import datetime
+
+import genquery
+import requests
+import urllib3
+
+import datacite
+from meta import vault_metadata_matches_schema
+from publication import get_publication_config
+from util import *
+
+
+def find_full_package_path(ctx, package_name, write_stdout):
+    """
+    Find the full path of a data package based on its short name.
+
+    :param ctx:          Combined type of a callback and rei struct
+    :param package_name: The short name of the data package to find.
+    :param write_stdout: A boolean representing whether to write to stdout or rodsLog
+
+    :returns: The full path of the data package if found, otherwise None.
+ """ + try: + query_condition = ( + "COLL_NAME like '%{}%'".format(package_name) + ) + query_attributes = "COLL_NAME" + iter = genquery.row_iterator(query_attributes, query_condition, genquery.AS_LIST, ctx) + + # Return full package path if exists + for row in iter: + return row[0] + except Exception as e: + log.write(ctx, "find_full_package_path: An error occurred while executing the query: {}".format(e), write_stdout) + return None + + +def find_data_packages(ctx, write_stdout): + """ + Find all data packages in Retry, Unrecoverable and Unknown status by matching its AVU. + + :param ctx: Combined type of a callback and rei struct + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: A list of collection names that have not been processed successfully + """ + user_zone = user.zone(ctx) + + try: + # Get all the vault packages that have org_publication_status in metadata + query_condition = ( + "COLL_NAME like '/{}/home/vault-%' AND " + "META_COLL_ATTR_NAME = '{}publication_status'".format(user_zone, constants.UUORGMETADATAPREFIX) + ) + query_attributes = "COLL_NAME" + iter = genquery.row_iterator(query_attributes, query_condition, genquery.AS_LIST, ctx) + + # Collecting only the collection names + return [row[0] for row in iter] + + except Exception as e: + log.write(ctx, "find_data_packages: An error occurred while executing the query: {}".format(e), write_stdout) + return [] + + +def check_print_data_package_system_avus(ctx, data_package, write_stdout): + """ + Checks whether a data package has the expected system AVUs that start with constants.UUORGMETADATAPREFIX (i.e, 'org_'). + This function compares the AVUs of the provided data package against a set of ground truth AVUs derived from + a successfully published data package. + This also prints if there are any missing or unexpected results. + + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: A 2-tuple containing boolean results of checking results + """ + extracted_avus = avu.of_coll(ctx, data_package) + results = misc.check_data_package_system_avus(extracted_avus) + + if not results["no_missing_avus"]: + log.write(ctx, "check_data_package_system_avus: There are some missing AVUs in data package <{}> - {}".format(data_package, list(results["missing_avus"])), write_stdout) + + if not results["no_unexpected_avus"]: + log.write(ctx, "check_data_package_system_avus: There are some unexpected AVUs in data package <{}> - {}".format(data_package, list(results["unexpected_avus"])), write_stdout) + + return (results["no_missing_avus"], results["no_unexpected_avus"]) + + +def check_one_datacite_doi_reg(ctx, data_package, doi_name, write_stdout): + try: + doi = get_val_for_attr_with_pub_prefix(ctx, data_package, doi_name) + except ValueError as e: + log.write(ctx, "check_datacite_doi_registration: Error while trying to get {} - {}".format(doi_name, e), write_stdout) + return False + + status_code = datacite.metadata_get(ctx, doi) + return status_code == 200 + + +def check_datacite_doi_registration(ctx, data_package, write_stdout): + """ + Check the registration status of both versionDOI and baseDOI with the DataCite API, + ensuring that both DOIs return a 200 status code, which indicates successful registration. 
+ + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: A tuple of booleans indicating check success or not (base doi check may be None if not relevant). + """ + version_doi_check = check_one_datacite_doi_reg(ctx, data_package, "versionDOI", write_stdout) + + previous_version = '' + try: + previous_version = get_val_for_attr_with_pub_prefix(ctx, data_package, "previous_version") + except Exception: + pass + + if previous_version: + base_doi_check = check_one_datacite_doi_reg(ctx, data_package, "baseDOI", write_stdout) + return version_doi_check, base_doi_check + + return (version_doi_check, None) + + +def get_val_for_attr_with_pub_prefix(ctx, data_package, attribute_suffix): + """ + Retrieves the value given the suffix of the attribute from a data package. + + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. + :param attribute_suffix: Suffix of the attribute before adding prefix such as "org_publication_" + + :returns: Value of the attribute. + """ + attr = constants.UUORGMETADATAPREFIX + "publication_" + attribute_suffix + return avu.get_attr_val_of_coll(ctx, data_package, attr) + + +def get_landingpage_paths(ctx, data_package, write_stdout): + """Given a data package get what the path and remote url should be""" + file_path = '' + try: + file_path = get_val_for_attr_with_pub_prefix(ctx, data_package, "landingPagePath") + url = get_val_for_attr_with_pub_prefix(ctx, data_package, "landingPageUrl") + return file_path, url + + except Exception: + log.write(ctx, "get_landingpage_paths: Could not find landing page for data package: {}".format(data_package), write_stdout) + return '', '' + + +def compare_local_remote_landingpage(ctx, file_path, url, offline, api_call): + """ + Compares file contents between a file in irods and its remote version to verify their integrity. 
+ + :param ctx: Combined type of a callback and rei struct + :param file_path: Path to file in irods + :param url: URL of file on remote + :param offline: Whether to skip requests.get call + :param api_call: Boolean representing whether was called by api and not a script + + :returns: True if the file contents match, False otherwise + """ + write_stdout = not api_call + # Local/irods file + if api_call: + # If called by technicaladmin, only check that the file exists since we don't have access to the contents + return data_object.exists(ctx, file_path) + else: + try: + local_data = data_object.read(ctx, file_path) + except Exception: + log.write(ctx, "compare_local_remote_landingpage: Local file not found at path {}.".format(file_path), write_stdout) + return False + + if offline: + return len(local_data) > 0 + + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + try: + response = requests.get(url, verify=False) + except requests.exceptions.ConnectionError as e: + log.write(ctx, "compare_local_remote_landingpage: Failed to connect to {}".format(url), write_stdout) + log.write(ctx, "compare_local_remote_landingpage: Error: {}".format(e), write_stdout) + return False + + if response.status_code != 200: + log.write(ctx, "compare_local_remote_landingpage: Error {} when connecting to <{}>.".format(response.status_code, url), write_stdout) + return False + + # Set encoding to utf-8 for the response text (otherwise will not match local_data) + response.encoding = 'utf-8' + + if local_data == response.text: + return True + + log.write(ctx, "compare_local_remote_landingpage: File contents at irods path <{}> and remote landing page <{}> do not match.".format(file_path, url), write_stdout) + return False + + +def check_landingpage(ctx, data_package, offline, api_call): + """ + Checks the integrity of landing page by comparing the contents + + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. + :param offline: Whether to skip any checks that require external server access + :param api_call: Boolean of whether this is for an api call version of the troubleshooting script + + :returns: A tuple containing boolean results of checking + """ + irods_file_path, landing_page_url = get_landingpage_paths(ctx, data_package, not api_call) + if len(irods_file_path) == 0 or len(landing_page_url) == 0: + return False + + return compare_local_remote_landingpage(ctx, irods_file_path, landing_page_url, offline, api_call) + + +def check_combi_json(ctx, data_package, publication_config, offline, write_stdout): + """ + Checks the integrity of combi JSON by checking URL and existence of file. + + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. 
+ :param publication_config: Dictionary of publication config + :param offline: Whether to skip any checks that require external server access + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: A tuple containing boolean results of checking + """ + # Check that the combi json in irods exists + file_path = '' + try: + file_path = get_val_for_attr_with_pub_prefix(ctx, data_package, "combiJsonPath") + except Exception: + pass + exists = data_object.exists(ctx, file_path) + if not exists: + log.write(ctx, "check_combi_json: combi JSON file in irods does not exist: {}".format(file_path), write_stdout) + return False + + if offline: + return True + + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + # Get the version doi + version_doi = '' + try: + version_doi = get_val_for_attr_with_pub_prefix(ctx, data_package, "versionDOI") + except Exception: + pass + url = "https://{}/oai/oai?verb=GetRecord&metadataPrefix=oai_datacite&identifier=oai:{}".format(publication_config["publicVHost"], version_doi) + try: + response = requests.get(url, verify=False) + except requests.exceptions.ConnectionError as e: + log.write(ctx, "check_combi_json: Failed to connect to {}".format(url), write_stdout) + log.write(ctx, "check_combi_json: Error: {}".format(e), write_stdout) + return False + + if response.status_code != 200: + log.write(ctx, "check_combi_json: Error {} when connecting to <{}>.".format(response.status_code, url), write_stdout) + return False + + # Look at the first few parts of the response for signs of error. + if "idDoesNotExist" in response.text[:5000]: + log.write(ctx, "check_combi_json: combiJson not found in oai for data package <{}>".format(data_package), write_stdout) + return False + + return True + + +def print_troubleshoot_result(ctx, data_package, result, datacite_check): + """Print the result of troubleshooting one package in human-friendly format""" + pass_all_tests = all(result.values()) + + log.write(ctx, "Results for: {}".format(data_package), True) + if pass_all_tests: + log.write(ctx, "Package passed all tests.", True) + else: + log.write(ctx, "Package FAILED one or more tests:", True) + log.write(ctx, "Schema matches: {}".format(result['schema_check']), True) + log.write(ctx, "All expected AVUs exist: {}".format(result['no_missing_AVUs_check']), True) + log.write(ctx, "No unexpected AVUs: {}".format(result['no_unexpected_AVUs_check']), True) + + if datacite_check: + log.write(ctx, "Version DOI matches: {}".format(result['versionDOI_check']), True) + if 'baseDOI_check' in result: + log.write(ctx, "Base DOI matches: {}".format(result['baseDOI_check']), True) + + log.write(ctx, "Landing page matches: {}".format(result['landingPage_check']), True) + log.write(ctx, "Combined JSON matches: {}".format(result['combiJson_check']), True) + + log.write(ctx, "", True) + + +def collect_troubleshoot_data_packages(ctx, requested_package, write_stdout): + data_packages = [] + + if requested_package == 'None': + # Retrieve all data packages + all_packages = find_data_packages(ctx, write_stdout) + if not all_packages: + log.write(ctx, "collect_troubleshoot_data_packages: No packages found.", write_stdout) + return None + + data_packages = all_packages + else: + # Get full path of the given package + full_package_path = find_full_package_path(ctx, requested_package, write_stdout) + + if not full_package_path: + log.write(ctx, "collect_troubleshoot_data_packages: Data package '{}' cannot be found.".format(requested_package), 
write_stdout) + return None + + data_packages.append(full_package_path) + + return data_packages + + +def batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline, api_call, check_datacite): + """ + Troubleshoots published data packages. + + :param ctx: Context that combines a callback and rei struct. + :param requested_package: A string representing a specific data package path or all packages with failed publications. + :param log_file: A boolean representing to write results in log. + :param offline: A boolean representing whether to perform all checks without connecting to external servers. + :param api_call: Boolean of whether this is run by a script or api test. + :param check_datacite: Boolean representing whether to do the datacite checks + + :returns: A dictionary of dictionaries providing the results of the job. + """ + write_stdout = not api_call + # Check permissions - rodsadmin only + if user.user_type(ctx) != 'rodsadmin': + log.write(ctx, "User is not rodsadmin", write_stdout) + return {} + + data_packages = collect_troubleshoot_data_packages(ctx, requested_package, write_stdout) + if not data_packages: + return {} + schema_cache = {} + results = {} + + # Troubleshooting + for data_package in data_packages: + log.write(ctx, "Troubleshooting data package: {}".format(data_package), write_stdout) + result = {} + # Cannot check the metadata as technicaladmin + if not api_call: + schema_check_dict = vault_metadata_matches_schema(ctx, data_package, schema_cache, "troubleshoot-publications", write_stdout) + result['schema_check'] = schema_check_dict['match_schema'] if schema_check_dict else False + + result['no_missing_AVUs_check'], result['no_unexpected_AVUs_check'] = check_print_data_package_system_avus(ctx, data_package, write_stdout) + + # Only check datacite if enabled + if check_datacite: + result['versionDOI_check'], base_doi_check = check_datacite_doi_registration(ctx, data_package, write_stdout) + if base_doi_check is not None: + result['baseDOI_check'] = base_doi_check + + result['landingPage_check'] = check_landingpage(ctx, data_package, offline, api_call) + publication_config = get_publication_config(ctx) + result['combiJson_check'] = check_combi_json(ctx, data_package, publication_config, offline, write_stdout) + + results[data_package] = result + + if not api_call: + print_troubleshoot_result(ctx, data_package, result, check_datacite) + + if log_file: + log_loc = "/var/lib/irods/log/troubleshoot_publications.log" + with open(log_loc, "a") as writer: + writer.writelines("Batch run date and time: {}".format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))) + writer.writelines('\n') + writer.writelines("Troubleshooting data package: {}".format(data_package)) + writer.writelines('\n') + json.dump(result, writer) + writer.writelines('\n') + + return results + + +@api.make() +def api_batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline): + """ + Wrapper for the batch script for troubleshooting published data packages. + Runs a subset of the tests since "technicaladmin" is usually more restricted than "rods". + + :param ctx: Combined type of a callback and rei struct + :param requested_package: A string representing a specific data package path or all packages with failed publications. + :param log_file: A boolean representing to write results in log. + :param offline: A boolean representing whether to perform all checks without connecting to external servers. 
+
+    :returns: A dictionary of dictionaries providing the results of the job.
+    """
+    return batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline, True, False)
+
+
+@rule.make(inputs=[0, 1, 2, 3], outputs=[])
+def rule_batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline, no_datacite):
+    """
+    Troubleshoot published data packages.
+
+    Prints results of the following checks:
+        1. Metadata schema compliance.
+        2. Presence and correctness of expected AVUs.
+        3. Registration with DataCite.
+        4. File integrity of landing page and combi JSON files.
+
+    Operates on either a single specified package or all published packages, depending on the input.
+
+    :param ctx:               Context that combines a callback and rei struct.
+    :param requested_package: A string representing a specific data package path or all packages with failed publications.
+    :param log_file:          A string boolean representing whether to write results to a log file.
+    :param offline:           A string boolean representing whether to perform all checks without connecting to external servers.
+    :param no_datacite:       A string boolean representing whether to skip the DataCite checks
+    """
+    offline = offline == "True"
+    log_file = log_file == "True"
+    check_datacite = no_datacite == "False"
+
+    batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline, False, check_datacite)
diff --git a/schema_transformation.py b/schema_transformation.py
index d7f7cc947..77299ada3 100644
--- a/schema_transformation.py
+++ b/schema_transformation.py
@@ -19,7 +19,6 @@
 import session_vars
 
 import meta
-import meta_form
 import schema
 import schema_transformations
 from util import *
@@ -404,41 +403,13 @@ def rule_batch_vault_metadata_schema_report(ctx):
                                 genquery.AS_LIST, ctx)
 
     for row in iter:
-        coll_name = row[0]
-        metadata_path = meta.get_latest_vault_metadata_path(ctx, coll_name)
-
-        if metadata_path == '' or metadata_path is None:
-            log.write(ctx, "Vault metadata schema report skips %s, because metadata could not be found."
-                      % (coll_name))
-            continue
-
-        try:
-            metadata = jsonutil.read(ctx, metadata_path)
-        except Exception as exc:
-            log.write(ctx, "Vault metadata report skips %s, because of exception while reading metadata file %s: %s."
-                      % (coll_name, metadata_path, str(exc)))
+        coll_name = row[0]
+        try:
+            result = meta.vault_metadata_matches_schema(ctx, coll_name, schema_cache, "Vault metadata schema report", True)
+            if result:
+                results[coll_name] = result
+        except Exception as e:
+            log.write(ctx, "Error processing collection {}: {}".format(coll_name, str(e)))
             continue
-
-        # Determine schema
-        schema_id = schema.get_schema_id(ctx, metadata_path)
-        schema_shortname = schema_id.split("/")[-2]
-
-        # Retrieve schema and cache it for future use
-        schema_path = schema.get_schema_path_by_id(ctx, metadata_path, schema_id)
-        if schema_shortname in schema_cache:
-            schema_contents = schema_cache[schema_shortname]
-        else:
-            schema_contents = jsonutil.read(ctx, schema_path)
-            schema_cache[schema_shortname] = schema_contents
-
-        # Check whether metadata matches schema and log any errors
-        error_list = meta.get_json_metadata_errors(ctx, metadata_path, metadata=metadata, schema=schema_contents)
-        match_schema = len(error_list) == 0
-        if not match_schema:
-            log.write(ctx, "Vault metadata schema report: metadata %s did not match schema %s: %s" %
-                      (metadata_path, schema_shortname, str([meta_form.humanize_validation_error(e).encode('utf-8') for e in error_list])))
-
-        # Update results
-        results[coll_name] = {"schema": schema_shortname, "match_schema": match_schema}
-
     return json.dumps(results)
diff --git a/tests/features/api/api_vault.feature b/tests/features/api/api_vault.feature
index 0039a709b..4ed3d018b 100644
--- a/tests/features/api/api_vault.feature
+++ b/tests/features/api/api_vault.feature
@@ -98,6 +98,17 @@ Feature: Vault API
             | /tempZone/home/vault-default-2 |
             | /tempZone/home/vault-core-2    |
             | /tempZone/home/vault-default-3 |
+
+
+    Scenario Outline: Published vault package passes troubleshooting script checks
+        Given user technicaladmin is authenticated
+        And data package exists in <vault>
+        Then data package in <vault> passes troubleshooting script checks
+
+        Examples:
+            | vault                          |
+            | /tempZone/home/vault-default-2 |
+            | /tempZone/home/vault-default-3 |
 
 
     Scenario Outline: Vault preservable formats lists
diff --git a/tests/step_defs/api/common_vault.py b/tests/step_defs/api/common_vault.py
index 2cfa8fa55..9b2706221 100644
--- a/tests/step_defs/api/common_vault.py
+++ b/tests/step_defs/api/common_vault.py
@@ -174,6 +174,21 @@ def data_package_status(user, vault, data_package, status):
         raise AssertionError()
 
 
+@then(parsers.parse('data package in {vault} passes troubleshooting script checks'))
+def api_vault_batch_troubleshoot(user, vault, data_package):
+    http_status, result = api_request(
+        user,
+        "batch_troubleshoot_published_data_packages",
+        {"requested_package": data_package, "log_file": True, "offline": True}
+    )
+    assert http_status == 200
+    data = result['data']
+    assert len(data) == 1
+    # Confirm that all checks passed for this data package
+    for checks in data.values():
+        assert all(checks.values())
+
+
 @then('preservable formats lists are returned')
 def preservable_formats_lists(api_response):
     http_status, body = api_response
diff --git a/tools/troubleshoot-published-data.py b/tools/troubleshoot-published-data.py
new file mode 100644
index 000000000..bba14bc72
--- /dev/null
+++ b/tools/troubleshoot-published-data.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+"""This script collects all published packages and checks that they have all the required info.
+
+Example:
+To check all published packages:
+python3 troubleshoot-published-data.py
+
+To check one specific package by name:
+python3 troubleshoot-published-data.py -p research-initial[1725262507]
+
+To put results into a log file and complete the checks offline:
+python3 troubleshoot-published-data.py -l -o
+"""
+import argparse
+import subprocess
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        prog="troubleshoot-published-data.py",
+        description=__doc__,
+        formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument("-l", "--log-file", action='store_true',
+                        help="If log file parameter is true then write to log at: /var/lib/irods/log/troubleshoot_publications.log")
+    parser.add_argument("-o", "--offline", action='store_true',
+                        help="If actions should be performed without connecting to external servers (needed for the Yoda team's development setup).")
+    parser.add_argument("-n", "--no-datacite", action='store_true',
+                        help="If datacite check should be skipped (needed for the Yoda team's development environment in some cases).")
+    parser.add_argument("-p", "--package", type=str, required=False,
+                        help="Troubleshoot a specific data package by name (default: troubleshoot all packages)")
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    rule_name = "/etc/irods/yoda-ruleset/tools/troubleshoot_data.r"
+    data_package = f"*data_package={args.package}"
+    log_loc = f"*log_loc={args.log_file if args.log_file else ''}"
+    offline = f"*offline={args.offline}"
+    no_datacite = f"*no_datacite={args.no_datacite}"
+    subprocess.call(['irule', '-r', 'irods_rule_engine_plugin-python-instance', '-F',
+                     rule_name, data_package, log_loc, offline, no_datacite])
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/troubleshoot_data.r b/tools/troubleshoot_data.r
new file mode 100644
index 000000000..3caac4671
--- /dev/null
+++ b/tools/troubleshoot_data.r
@@ -0,0 +1,11 @@
+#!/usr/bin/irule -r irods_rule_engine_plugin-python-instance -F
+
+def main(rule_args, callback, rei):
+    data_package = global_vars["*data_package"].strip('"')
+    log_loc = global_vars["*log_loc"].strip('"')
+    offline = global_vars["*offline"].strip('"')
+    no_datacite = global_vars["*no_datacite"].strip('"')
+    callback.rule_batch_troubleshoot_published_data_packages(data_package, log_loc, offline, no_datacite)
+
+INPUT *data_package="", *log_loc="", *offline="", *no_datacite=""
+OUTPUT ruleExecOut
diff --git a/unit-tests/test_util_misc.py b/unit-tests/test_util_misc.py
index aa03ef2c2..428fa33e8 100644
--- a/unit-tests/test_util_misc.py
+++ b/unit-tests/test_util_misc.py
@@ -6,16 +6,181 @@
 import sys
 import time
-from collections import OrderedDict
+from collections import namedtuple, OrderedDict
 from unittest import TestCase
 
 sys.path.append('../util')
 
-from misc import human_readable_size, last_run_time_acceptable, remove_empty_objects
+from misc import check_data_package_system_avus, human_readable_size, last_run_time_acceptable, remove_empty_objects
+
+# AVs of a successfully published data package, that is the first version of the package
+avs_success_data_package = {
+    "org_publication_accessRestriction": "Open - freely retrievable",
+    "org_publication_anonymousAccess": "yes",
+    "org_publication_approval_actor": "datamanager#tempZone",
+    "org_publication_combiJsonPath": "/tempZone/yoda/publication/ICGVFV-combi.json",
+    "org_publication_dataCiteJsonPath": "/tempZone/yoda/publication/ICGVFV-dataCite.json",
+    "org_publication_dataCiteMetadataPosted": "yes",
+    "org_publication_landingPagePath": "/tempZone/yoda/publication/ICGVFV.html",
+    "org_publication_landingPageUploaded": "yes",
+    "org_publication_landingPageUrl": "https://public.yoda.test/allinone/UU01/ICGVFV.html",
+    "org_publication_lastModifiedDateTime": "2024-10-04T15:32:46.000000",
+    "org_publication_license": "Creative Commons Attribution 4.0 International Public License",
+    "org_publication_licenseUri": "https://creativecommons.org/licenses/by/4.0/legalcode",
+    "org_publication_oaiUploaded": "yes",
+    "org_publication_publicationDate": "2024-10-04T15:33:17.853806",
+    "org_publication_randomId": "ICGVFV",
+    "org_publication_status": "OK",
+    "org_publication_submission_actor": "researcher#tempZone",
+    "org_publication_vaultPackage": "/tempZone/home/vault-default-3/research-default-3[1728048679]",
+    "org_publication_versionDOI": "10.00012/UU01-ICGVFV",
+    "org_publication_versionDOIMinted": "yes",
+}
+
+avs_success_data_package_multiversion = {
+    "org_publication_accessRestriction": "Open - freely retrievable",
+    "org_publication_anonymousAccess": "yes",
+    "org_publication_approval_actor": "datamanager#tempZone",
+    "org_publication_baseDOI": "10.00012/UU01-X0GU3S",
+    "org_publication_baseDOIMinted": "yes",
+    "org_publication_baseRandomId": "X0GU3S",
+    "org_publication_combiJsonPath": "/tempZone/yoda/publication/YU0JDH-combi.json",
+    "org_publication_dataCiteJsonPath": "/tempZone/yoda/publication/YU0JDH-dataCite.json",
+    "org_publication_dataCiteMetadataPosted": "yes",
+    "org_publication_landingPagePath": "/tempZone/yoda/publication/YU0JDH.html",
+    "org_publication_landingPageUploaded": "yes",
+    "org_publication_landingPageUrl": "https://public.yoda.test/allinone/UU01/YU0JDH.html",
+    "org_publication_lastModifiedDateTime": "2024-10-11T08:49:17.000000",
+    "org_publication_license": "Custom",
+    "org_publication_oaiUploaded": "yes",
+    "org_publication_previous_version": "/tempZone/home/vault-initial1/new-group01[1728550839]",
+    "org_publication_publicationDate": "2024-10-11T08:50:01.812220",
+    "org_publication_randomId": "YU0JDH",
+    "org_publication_status": "OK",
+    "org_publication_submission_actor": "datamanager#tempZone",
+    "org_publication_vaultPackage": "/tempZone/home/vault-initial1/new-group01[1728629336]",
+    "org_publication_versionDOI": "10.00012/UU01-YU0JDH",
+    "org_publication_versionDOIMinted": "yes"
+}
+
+avs_success_data_package_multiversion_first = {
+    "org_publication_accessRestriction": "Open - freely retrievable",
+    "org_publication_anonymousAccess": "yes",
+    "org_publication_approval_actor": "datamanager#tempZone",
+    "org_publication_baseDOI": "10.00012/UU01-X0GU3S",
+    "org_publication_baseRandomId": "X0GU3S",
+    "org_publication_combiJsonPath": "/tempZone/yoda/publication/T8D8QU-combi.json",
+    "org_publication_dataCiteJsonPath": "/tempZone/yoda/publication/T8D8QU-dataCite.json",
+    "org_publication_dataCiteMetadataPosted": "yes",
+    "org_publication_landingPagePath": "/tempZone/yoda/publication/T8D8QU.html",
+    "org_publication_landingPageUploaded": "yes",
+    "org_publication_landingPageUrl": "https://public.yoda.test/allinone/UU01/T8D8QU.html",
+    "org_publication_lastModifiedDateTime": "2024-10-10T09:06:05.000000",
+    "org_publication_license": "Creative Commons Attribution 4.0 International Public License",
+    "org_publication_licenseUri": "https://creativecommons.org/licenses/by/4.0/legalcode",
+    "org_publication_next_version": "/tempZone/home/vault-initial1/new-group01[1728545387]",
+    "org_publication_oaiUploaded": "yes",
+    "org_publication_publicationDate": "2024-10-10T09:06:02.177810",
+    "org_publication_randomId": "T8D8QU",
+    "org_publication_status": "OK",
+    "org_publication_submission_actor": "datamanager#tempZone",
+    "org_publication_vaultPackage": "/tempZone/home/vault-initial1/new-group01[1728543897]",
+    "org_publication_versionDOI": "10.00012/UU01-T8D8QU",
+    "org_publication_versionDOIMinted": "yes",
+}
+
+# From avu.py
+Avu = namedtuple('Avu', list('avu'))
+Avu.attr = Avu.a
+Avu.value = Avu.v
+Avu.unit = Avu.u
 
 
 class UtilMiscTest(TestCase):
+    def test_check_data_package_system_avus(self):
+        # Success
+        avs = avs_success_data_package
+        avus_success = [Avu(attr, val, "") for attr, val in avs.items()]
+        result = check_data_package_system_avus(avus_success)
+        self.assertTrue(result['no_missing_avus'])
+        self.assertTrue(result['no_unexpected_avus'])
+        self.assertTrue(len(result['missing_avus']) == 0)
+        self.assertTrue(len(result['unexpected_avus']) == 0)
+
+        # Success, extra optional avu
+        avs['org_publication_baseDOIAvailable'] = 'yes'
+        avus_success = [Avu(attr, val, "") for attr, val in avs.items()]
+        result = check_data_package_system_avus(avus_success)
+        self.assertTrue(result['no_missing_avus'])
+        self.assertTrue(result['no_unexpected_avus'])
+        self.assertTrue(len(result['missing_avus']) == 0)
+        self.assertTrue(len(result['unexpected_avus']) == 0)
+        del avs['org_publication_baseDOIAvailable']
+
+        # Missing license Uri for non-custom license
+        del avs['org_publication_licenseUri']
+        avus_missing_license_uri = [Avu(attr, val, "") for attr, val in avs.items()]
+        result = check_data_package_system_avus(avus_missing_license_uri)
+        self.assertFalse(result['no_missing_avus'])
+        self.assertTrue(result['no_unexpected_avus'])
+        self.assertTrue(len(result['missing_avus']) == 1)
+        self.assertTrue(len(result['unexpected_avus']) == 0)
+
+        # Custom license, no license Uri (happy flow)
+        avs['org_publication_license'] = "Custom"
+        avus_custom_license = [Avu(attr, val, "") for attr, val in avs.items()]
+        result = check_data_package_system_avus(avus_custom_license)
+        self.assertTrue(result['no_missing_avus'])
+        self.assertTrue(result['no_unexpected_avus'])
+        self.assertTrue(len(result['missing_avus']) == 0)
+        self.assertTrue(len(result['unexpected_avus']) == 0)
+
+        # Unexpected
+        avs['org_publication_userAddedSomethingWeird'] = "yodayoda:)"
+        avus_unexpected = [Avu(attr, val, "") for attr, val in avs.items()]
+        result = check_data_package_system_avus(avus_unexpected)
+        self.assertTrue(result['no_missing_avus'])
+        self.assertFalse(result['no_unexpected_avus'])
+        self.assertTrue(len(result['missing_avus']) == 0)
+        self.assertTrue(len(result['unexpected_avus']) == 1)
+
+        # Missing and unexpected
+        del avs['org_publication_landingPagePath']
+        avus_missing_unexpected = [Avu(attr, val, "") for attr, val in avs.items()]
+        result = check_data_package_system_avus(avus_missing_unexpected)
+        self.assertFalse(result['no_missing_avus'])
+        self.assertFalse(result['no_unexpected_avus'])
+        self.assertTrue(len(result['missing_avus']) == 1)
+        self.assertTrue(len(result['unexpected_avus']) == 1)
+
+        # Missing
+        del avs['org_publication_userAddedSomethingWeird']
+        avus_missing = [Avu(attr, val, "") for attr, val in avs.items()]
+        result = check_data_package_system_avus(avus_missing)
+        self.assertFalse(result['no_missing_avus'])
+        self.assertTrue(result['no_unexpected_avus'])
+        self.assertTrue(len(result['missing_avus']) == 1)
+        self.assertTrue(len(result['unexpected_avus']) == 0)
+
+        # Success, latest version of a publication
+        avs = avs_success_data_package_multiversion
val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_success) + self.assertTrue(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 0) + + # Success, first version of a publication that has had other versions + avs = avs_success_data_package_multiversion_first + avus_success = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_success) + self.assertTrue(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 0) + def test_last_run_time_acceptable(self): """Test the last run time for copy to vault""" # No last run time (job hasn't been tried before) diff --git a/util/avu.py b/util/avu.py index 470620403..0098fcea4 100644 --- a/util/avu.py +++ b/util/avu.py @@ -35,6 +35,18 @@ def of_coll(ctx, coll): "COLL_NAME = '{}'".format(coll))) +def get_attr_val_of_coll(ctx, coll, attr): + """Get the value corresponding to an attr for a given collection.""" + iter = genquery.Query( + ctx, + "META_COLL_ATTR_VALUE", + "META_COLL_ATTR_NAME = '{}' AND COLL_NAME = '{}'".format(attr, coll)) + + for row in iter: + return row + raise ValueError("Attribute {} not found in AVUs of collection {}".format(attr, coll)) + + def inside_coll(ctx, path, recursive=False): """Get a list of all AVUs inside a collection with corresponding paths. diff --git a/util/log.py b/util/log.py index 897b9562c..545e626ca 100644 --- a/util/log.py +++ b/util/log.py @@ -17,15 +17,20 @@ import user -def write(ctx, message): - """Write a message to the log, including client name and originating module. +def write(ctx, message, write_stdout=False): + """Write a message to the log or stdout. + Includes client name and originating module if writing to log. - :param ctx: Combined type of a callback and rei struct - :param message: Message to write to log + :param ctx: Combined type of a callback and rei struct + :param message: Message to write to log + :param write_stdout: Whether to write to stdout (used for a few of our scripts) """ - stack = inspect.stack()[1] - module = inspect.getmodule(stack[0]) - _write(ctx, '[{}] {}'.format(module.__name__.replace("rules_uu.", ""), message)) + if write_stdout: + ctx.writeLine("stdout", message) + else: + stack = inspect.stack()[1] + module = inspect.getmodule(stack[0]) + _write(ctx, '[{}] {}'.format(module.__name__.replace("rules_uu.", ""), message)) def _write(ctx, message): @@ -40,15 +45,6 @@ def _write(ctx, message): ctx.writeLine('serverLog', message) -def write_stdout(ctx, message): - """Write a message to stdout. Used for some of our scripts. - - :param ctx: Combined type of a callback and rei struct - :param message: Message to write to log - """ - ctx.writeLine("stdout", message) - - def debug(ctx, message): """"Write a message to the log, if in a development environment. diff --git a/util/misc.py b/util/misc.py index a7d1c4471..73b05d2e6 100644 --- a/util/misc.py +++ b/util/misc.py @@ -8,6 +8,88 @@ import time from collections import OrderedDict +import constants + + +def check_data_package_system_avus(extracted_avus): + """ + Checks whether a data package has the expected system AVUs that start with constants.UUORGMETADATAPREFIX (i.e, 'org_'). 
+ This function compares the AVUs of the provided data package against a set of ground truth AVUs derived from + a successfully published data package. + + :param extracted_avus: AVUs of the data package in AVU form + + :returns: Dictionary of the results of the check + """ + # Filter those starting with 'org_publication' + extracted_avs = {} + for m in extracted_avus: + if m.attr.startswith(constants.UUORGMETADATAPREFIX + 'publication_'): + extracted_avs[m.attr] = m.value + extracted_attrs = set(extracted_avs.keys()) + + # Define the set of ground truth AVUs + avu_names_suffix = { + 'approval_actor', 'randomId', + 'versionDOI', 'dataCiteJsonPath', 'license', + 'anonymousAccess', 'versionDOIMinted', + 'accessRestriction', 'landingPagePath', + 'publicationDate', + 'vaultPackage', 'submission_actor', 'status', + 'lastModifiedDateTime', 'combiJsonPath', + 'landingPageUploaded', 'oaiUploaded', + 'landingPageUrl', 'dataCiteMetadataPosted' + } + + # If the license is not Custom, it must have a licenseUri + if constants.UUORGMETADATAPREFIX + 'publication_license' in extracted_attrs: + if extracted_avs[constants.UUORGMETADATAPREFIX + 'publication_license'] != "Custom": + avu_names_suffix.add('licenseUri') + + # Define additional set of AVUs with more than one version of publication + avu_names_version_suffix = { + 'previous_version', 'baseDOI', 'baseRandomId', + 'baseDOIMinted' + } + + # Define additional set of AVUs expected for the first version of a publication, when there are multiple versions + avu_names_first_version_suffix = { + 'baseRandomId', 'baseDOI', 'next_version' + } + + # for the second version, all we need is next_version in addition to avu_names_version_suffix + avu_names_previous_version_suffix = {'next_version'} + + # optional avus + avu_names_optional_suffix = { + 'versionDOIAvailable', 'baseDOIAvailable' + } + + combined_avu_names_suffix = avu_names_suffix + + if constants.UUORGMETADATAPREFIX + 'publication_previous_version' in extracted_attrs: + combined_avu_names_suffix.update(avu_names_version_suffix) + if constants.UUORGMETADATAPREFIX + 'publication_next_version' in extracted_attrs: + combined_avu_names_suffix.update(avu_names_previous_version_suffix) + elif constants.UUORGMETADATAPREFIX + 'publication_next_version' in extracted_attrs: + combined_avu_names_suffix.update(avu_names_first_version_suffix) + + ground_truth_avus = {"{}publication_{}".format(constants.UUORGMETADATAPREFIX, name) for name in combined_avu_names_suffix} + combined_avu_names_suffix.update(avu_names_optional_suffix) + ground_truth_avus_with_optional = {"{}publication_{}".format(constants.UUORGMETADATAPREFIX, name) for name in combined_avu_names_suffix} + # Find missing and unexpected AVUs + missing_avus = ground_truth_avus - extracted_attrs + unexpected_avus = extracted_attrs - ground_truth_avus_with_optional + + results = { + 'no_missing_avus': not bool(missing_avus), + 'missing_avus': list(missing_avus), + 'no_unexpected_avus': not bool(unexpected_avus), + 'unexpected_avus': list(unexpected_avus) + } + + return results + def last_run_time_acceptable(found, last_run, config_backoff_time): """Return whether the last run time is acceptable to continue with task."""
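Note on trying the new helper outside iRODS: check_data_package_system_avus in util/misc.py is plain Python, so it can be exercised the same way the unit tests above do. A minimal sketch, not part of the patch, which assumes it is run from the unit-tests directory so that ../util (and its constants module) is importable:

import sys
from collections import namedtuple

sys.path.append('../util')  # same path trick as unit-tests/test_util_misc.py

from misc import check_data_package_system_avus

# Mirror the Avu namedtuple from util/avu.py: fields a/v/u aliased to attr/value/unit
Avu = namedtuple('Avu', list('avu'))
Avu.attr = Avu.a
Avu.value = Avu.v
Avu.unit = Avu.u

# Deliberately incomplete set of publication AVUs, to show how missing ones are reported
avus = [
    Avu("org_publication_status", "OK", ""),
    Avu("org_publication_versionDOI", "10.00012/UU01-ICGVFV", ""),
]

result = check_data_package_system_avus(avus)
print(result['no_missing_avus'])        # False: most expected org_publication_* AVUs are absent
print(sorted(result['missing_avus']))   # e.g. org_publication_accessRestriction, org_publication_approval_actor, ...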